Merge branch 'graph_cleaning'

This commit is contained in:
Claudio Atzori 2020-06-12 11:40:25 +02:00
commit 4bcad1c9c3
26 changed files with 4134 additions and 56 deletions

View File

@@ -14,6 +14,7 @@ public class ModelConstants {
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
public static final String DNET_COUNTRY_TYPE = "dnet:countries";
public static final String DNET_REVIEW_LEVELS = "dnet:review_levels";
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";

View File

@@ -31,7 +31,7 @@ public class Instance implements Serializable {
// typed results
private Field<String> processingchargecurrency;
private Field<String> refereed; // peer-review status
private Qualifier refereed; // peer-review status
public Field<String> getLicense() {
return license;
@@ -113,11 +113,11 @@ public class Instance implements Serializable {
this.processingchargecurrency = processingchargecurrency;
}
public Field<String> getRefereed() {
public Qualifier getRefereed() {
return refereed;
}
public void setRefereed(Field<String> refereed) {
public void setRefereed(Qualifier refereed) {
this.refereed = refereed;
}
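
For reference, a minimal sketch (illustrative class name) of populating the Qualifier-typed field that replaces the old Field&lt;String&gt;; the "0001"/"peerReviewed" values mirror the Crossref mapping further down in this changeset:

import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Qualifier;

public class RefereedSketch {
	public static void main(String[] args) {
		// refereed is now a controlled term of the dnet:review_levels scheme
		Qualifier refereed = new Qualifier();
		refereed.setClassid("0001");
		refereed.setClassname("peerReviewed");
		refereed.setSchemeid("dnet:review_levels");
		refereed.setSchemename("dnet:review_levels");

		Instance i = new Instance();
		i.setRefereed(refereed);
	}
}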

View File

@@ -254,28 +254,25 @@ public class Result extends OafEntity implements Serializable {
final StructuredProperty p = baseMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
}
// title.remove(baseMainTitle);
}
StructuredProperty newMainTitle = null;
if (r.getTitle() != null) {
newMainTitle = getMainTitle(r.getTitle());
if (newMainTitle != null) {
if (newMainTitle != null && title != null) {
final StructuredProperty p = newMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
}
// r.getTitle().remove(newMainTitle);
}
if (newMainTitle != null && compareTrust(this, r) < 0)
if (newMainTitle != null && compareTrust(this, r) < 0) {
baseMainTitle = newMainTitle;
}
title = mergeLists(title, r.getTitle());
if (title != null && baseMainTitle != null)
if (title != null && baseMainTitle != null) {
title.add(baseMainTitle);
}
relevantdate = mergeLists(relevantdate, r.getRelevantdate());

View File

@@ -96,12 +96,21 @@ public class ProtoConverter implements Serializable {
.stream()
.distinct()
.collect(Collectors.toCollection(ArrayList::new)) : null);
i.setRefereed(mapStringField(ri.getRefereed()));
i.setRefereed(mapRefereed(ri.getRefereed()));
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
return i;
}
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
Qualifier q = new Qualifier();
q.setClassid(refereed.getValue());
q.setSchemename(refereed.getValue());
q.setSchemeid("dnet:review_levels");
q.setSchemename("dnet:review_levels");
return q;
}
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getExternalReferenceCount() > 0) {

View File

@@ -8,6 +8,7 @@ import java.io.File;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -19,6 +20,7 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager;
@Disabled
public class DnetCollectorWorkerApplicationTests {
private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class);

View File

@@ -166,8 +166,10 @@ case object Crossref2Oaf {
val has_review = (json \ "relation" \ "has-review" \ "id")
if(has_review != JNothing)
instance.setRefereed(asField("peerReviewed"))
if(has_review != JNothing) {
instance.setRefereed(
createQualifier("0001", "peerReviewed", "dnet:review_levels", "dnet:review_levels"))
}
instance.setAccessright(getRestrictedQualifier())

View File

@@ -0,0 +1,119 @@
package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class CleanGraphProperties {
private static final Logger log = LoggerFactory.getLogger(CleanGraphProperties.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CleanGraphProperties.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
fixGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
});
}
private static <T extends Oaf> void fixGraphTable(
SparkSession spark,
VocabularyGroup vocs,
String inputPath,
Class<T> clazz,
String outputPath) {
CleaningRule<T> rule = new CleaningRule<>(vocs);
readTableFromPath(spark, inputPath, clazz)
.map(rule, Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
}
private static <T extends Oaf> Dataset<T> readTableFromPath(
SparkSession spark, String inputEntityPath, Class<T> clazz) {
log.info("Reading Graph table from: {}", inputEntityPath);
return spark
.read()
.textFile(inputEntityPath)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
Encoders.bean(clazz));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}
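
For local experimentation, the same read, clean, and write chain can be driven without Oozie; a minimal sketch (hypothetical class name), assuming a pre-loaded VocabularyGroup, a local Spark master, and placeholder paths:

package eu.dnetlib.dhp.oa.graph.clean;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Publication;

public class CleanGraphLocalSketch {

	private static final ObjectMapper MAPPER = new ObjectMapper();

	public static void clean(VocabularyGroup vocs, String inputPath, String outputPath) {
		SparkSession spark = SparkSession.builder().master("local[*]").appName("clean").getOrCreate();
		CleaningRule<Publication> rule = new CleaningRule<>(vocs);
		spark
			.read()
			.textFile(inputPath) // one JSON record per line
			.map(
				(MapFunction<String, Publication>) s -> MAPPER.readValue(s, Publication.class),
				Encoders.bean(Publication.class))
			.map(rule, Encoders.bean(Publication.class)) // apply the vocabulary cleaning
			.write()
			.mode(SaveMode.Overwrite)
			.parquet(outputPath);
		spark.stop();
	}
}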

View File

@@ -0,0 +1,73 @@
package eu.dnetlib.dhp.oa.graph.clean;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
private VocabularyGroup vocabularies;
private Map<Class, Function<Object, Object>> mapping = Maps.newHashMap();
public CleaningRule(VocabularyGroup vocabularies) {
this.vocabularies = vocabularies;
mapping.put(Qualifier.class, o -> patchQualifier(o));
mapping.put(StructuredProperty.class, o -> patchSp(o));
mapping.put(Field.class, o -> patchStringField(o));
}
@Override
public T call(T value) throws Exception {
OafNavigator.apply(value, mapping);
return value;
}
private Object patchQualifier(Object o) {
Qualifier q = (Qualifier) o;
if (vocabularies.vocabularyExists(q.getSchemeid())) {
return vocabularies.lookup(q.getSchemeid(), q.getClassid());
}
return o;
}
private Object patchSp(Object o) {
StructuredProperty sp = (StructuredProperty) o;
if (StringUtils.isBlank(sp.getValue())) {
return null;
}
return o;
}
private Object patchStringField(Object o) {
Field f = (Field) o;
try {
if (StringUtils.isBlank((String) f.getValue())) {
return null;
}
} catch (ClassCastException e) {
// ignored on purpose
}
return o;
}
public VocabularyGroup getVocabularies() {
return vocabularies;
}
}
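
Since CleaningRule is a plain Spark MapFunction, it can also be exercised on a single record outside a cluster, as the unit test further down does; a minimal sketch (hypothetical class name), assuming an already loaded VocabularyGroup:

package eu.dnetlib.dhp.oa.graph.clean;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Publication;

public class CleaningRuleSketch {
	// patches Qualifiers via vocabulary lookup; blank StructuredProperty/Field values are nulled out
	public static Publication cleanOne(VocabularyGroup vocs, String json) throws Exception {
		return new CleaningRule<Publication>(vocs)
			.call(new ObjectMapper().readValue(json, Publication.class));
	}
}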

View File

@@ -0,0 +1,151 @@
package eu.dnetlib.dhp.oa.graph.clean;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import java.beans.BeanInfo;
import java.beans.IntrospectionException;
import java.beans.Introspector;
import java.beans.PropertyDescriptor;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.function.Function;
public class OafNavigator {
public static <E extends Oaf> E apply(E oaf, Map<Class, Function<Object, Object>> mapping) {
reflect(oaf, mapping);
return oaf;
}
public static void reflect(Object o, Map<Class, Function<Object, Object>> mapping) {
visit(o, mapping);
}
public static void visit(final Object thingy, Map<Class, Function<Object, Object>> mapping) {
try {
final Class<?> clazz = thingy.getClass();
if (!isPrimitive(thingy) && clazz.getPackage().equals(Oaf.class.getPackage())) {
final BeanInfo beanInfo = Introspector.getBeanInfo(clazz);
for (final PropertyDescriptor descriptor : beanInfo.getPropertyDescriptors()) {
try {
final Object value = descriptor.getReadMethod().invoke(thingy);
if (value != null && !isPrimitive(value)) {
System.out.println("VISITING " + descriptor.getName() + " " + descriptor.getPropertyType());
if (Iterable.class.isAssignableFrom(descriptor.getPropertyType())) {
for(Object vi : (Iterable) value) {
visit(vi, mapping);
}
} else {
if (mapping.containsKey(value.getClass())) {
final Object newValue = mapping.get(value.getClass()).apply(value);
System.out.println("PATCHING " + descriptor.getName()+ " " + descriptor.getPropertyType());
System.out.println("OLD VALUE " + getObjectMapper().writeValueAsString(value));
System.out.println("NEW VALUE " + getObjectMapper().writeValueAsString(newValue));
descriptor.getWriteMethod().invoke(thingy, newValue);
}
visit(value, mapping);
}
}
} catch (final IllegalArgumentException e) {
// handle this please
} catch (final IllegalAccessException e) {
// and this also
} catch (final InvocationTargetException e) {
// and this, too
} catch (JsonProcessingException e) {
e.printStackTrace();
}
}
}
} catch (final IntrospectionException e) {
// do something sensible here
}
}
private static ObjectMapper getObjectMapper() {
final ObjectMapper mapper = new ObjectMapper();
return mapper;
}
private static void navigate(Object o, Map<Class, Function<Object, Object>> mapping) {
if (Objects.isNull(o) || isPrimitive(o)) {
return;
} else {
try {
for (Field field : getAllFields(o.getClass())) {
System.out.println(field.getName());
field.setAccessible(true);
Object value = field.get(o);
if (Objects.nonNull(value)) {
final Class<?> fieldType = field.getType();
if ((fieldType.isArray() && !fieldType.getComponentType().isPrimitive())) {
Object[] fs = (Object[]) value;
for (Object fi : fs) {
navigate(fi, mapping);
}
} else if (Iterable.class.isAssignableFrom(fieldType)) {
Iterable fs = (Iterable) value;
for (Object fi : fs) {
navigate(fi, mapping);
}
} else {
if (mapping.containsKey(value.getClass())) {
System.out.println("PATCHING " + field.getName());
field.set(o, mapping.get(value.getClass()).apply(value));
}
}
}
}
} catch (IllegalAccessException | IllegalArgumentException e) {
throw new RuntimeException(e);
}
}
}
private static boolean isPrimitive(Object o) {
return o.getClass().isPrimitive()
|| o instanceof Class
|| o instanceof Integer
|| o instanceof Double
|| o instanceof Float
|| o instanceof Long
|| o instanceof Boolean
|| o instanceof String
|| o instanceof Byte;
}
private static List<Field> getAllFields(Class<?> clazz) {
return getAllFields(new LinkedList<>(), clazz);
}
private static List<Field> getAllFields(List<Field> fields, Class<?> clazz) {
fields.addAll(Arrays.asList(clazz.getDeclaredFields()));
final Class<?> superclass = clazz.getSuperclass();
if (Objects.nonNull(superclass) && superclass.getPackage().equals(Oaf.class.getPackage())) {
getAllFields(fields, superclass);
}
return fields;
}
}

View File

@@ -0,0 +1,94 @@
package eu.dnetlib.dhp.oa.graph.clean;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import java.lang.reflect.Field;
import java.util.*;
import java.util.function.Function;
public class OafNavigator2 {
public static <E extends Oaf> E apply(E oaf, Map<Class, Function<Object, Object>> mapping) {
navigate(oaf, mapping);
return oaf;
}
private static void navigate(Object o, Map<Class, Function<Object, Object>> mapping) {
if (Objects.isNull(o) || isPrimitive(o)) {
return;
} else {
try {
for (Field field : getAllFields(o.getClass())) {
System.out.println("VISITING " + field.getName() + " in " + o.getClass());
field.setAccessible(true);
Object value = field.get(o);
if (Objects.nonNull(value)) {
final Class<?> fieldType = field.getType();
if ((fieldType.isArray() && !fieldType.getComponentType().isPrimitive())) {
Object[] fs = (Object[]) value;
for (Object fi : fs) {
navigate(fi, mapping);
}
} else if (Iterable.class.isAssignableFrom(fieldType)) {
Iterable fs = (Iterable) value;
for (Object fi : fs) {
navigate(fi, mapping);
}
} else {
final Function<Object, Object> cleaningFn = mapping.get(value.getClass());
if (Objects.nonNull(cleaningFn)) {
final Object newValue = cleaningFn.apply(value);
if (!Objects.equals(value, newValue)) {
System.out.println("PATCHING " + field.getName()+ " " + value.getClass());
System.out.println("OLD VALUE " + getObjectMapper().writeValueAsString(value));
System.out.println("NEW VALUE " + getObjectMapper().writeValueAsString(newValue));
field.set(o, newValue);
}
}
}
}
}
} catch (IllegalAccessException | IllegalArgumentException | JsonProcessingException e) {
throw new RuntimeException(e);
}
}
}
private static ObjectMapper getObjectMapper() {
final ObjectMapper mapper = new ObjectMapper();
return mapper;
}
private static boolean isPrimitive(Object o) {
return o.getClass().isPrimitive()
|| o instanceof Class
|| o instanceof Integer
|| o instanceof Double
|| o instanceof Float
|| o instanceof Long
|| o instanceof Boolean
|| o instanceof String
|| o instanceof Byte;
}
private static List<Field> getAllFields(Class<?> clazz) {
return getAllFields(new LinkedList<>(), clazz);
}
private static List<Field> getAllFields(List<Field> fields, Class<?> clazz) {
fields.addAll(Arrays.asList(clazz.getDeclaredFields()));
final Class<?> superclass = clazz.getSuperclass();
if (Objects.nonNull(superclass) && superclass.getPackage().equals(Oaf.class.getPackage())) {
getAllFields(fields, superclass);
}
return fields;
}
}
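
A minimal sketch of the reflection walk above: register one function per concrete type and let navigate swap in the returned value wherever a field of that type is found (the class name and the Qualifier patch below are purely illustrative):

package eu.dnetlib.dhp.oa.graph.clean;

import java.util.Map;
import java.util.function.Function;

import com.google.common.collect.Maps;

import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;

public class OafNavigator2Sketch {
	public static void main(String[] args) {
		final Map<Class, Function<Object, Object>> mapping = Maps.newHashMap();
		mapping.put(Qualifier.class, o -> {
			// illustrative patch: return a copy of the Qualifier carrying a classname
			final Qualifier q = (Qualifier) o;
			final Qualifier patched = new Qualifier();
			patched.setClassid(q.getClassid());
			patched.setClassname("English");
			patched.setSchemeid(q.getSchemeid());
			patched.setSchemename(q.getSchemename());
			return patched;
		});

		final Publication p = new Publication();
		final Qualifier lang = new Qualifier();
		lang.setClassid("eng");
		p.setLanguage(lang);

		OafNavigator2.apply(p, mapping);
		System.out.println(p.getLanguage().getClassname()); // English
	}
}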

View File

@@ -39,6 +39,8 @@ import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class GenerateEntitiesApplication {
@@ -71,7 +73,8 @@ public class GenerateEntitiesApplication {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupUrl);
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@@ -139,9 +142,11 @@ public class GenerateEntitiesApplication {
switch (type.toLowerCase()) {
case "oaf-store-claim":
case "oaf-store-cleaned":
case "oaf-store-claim":
return new OafToOafMapper(vocs, false).processMdRecord(s);
case "odf-store-claim":
case "odf-store-cleaned":
case "odf-store-claim":
return new OdfToOafMapper(vocs, false).processMdRecord(s);
case "oaf-store-intersection":
return new OafToOafMapper(vocs, true).processMdRecord(s);

View File

@@ -71,6 +71,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
@@ -151,7 +152,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
super(hdfsPath);
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
this.lastUpdateTimestamp = new Date().getTime();
this.vocs = VocabularyGroup.loadVocsFromIS(isLookupUrl);
this.vocs = VocabularyGroup.loadVocsFromIS(ISLookupClientFactory.getLookUpService(isLookupUrl));
}
public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer)

View File

@@ -133,7 +133,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
instance
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
instance
.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance

View File

@@ -4,19 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENTED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENT_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PART;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SUPPLEMENT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.util.ArrayList;
import java.util.Arrays;
@@ -129,7 +117,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
instance
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));

View File

@@ -60,6 +60,10 @@ public class OafMapperUtils {
.collect(Collectors.toList());
}
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
}
public static Qualifier qualifier(
final String classid,
final String classname,

View File

@@ -4,14 +4,29 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
public class Vocabulary implements Serializable {
private final String id;
private final String name;
/**
* Code to Term mappings for this Vocabulary.
*/
private final Map<String, VocabularyTerm> terms = new HashMap<>();
/**
* Synonym to Code mappings for this Vocabulary.
*/
private final Map<String, String> synonyms = Maps.newHashMap();
public Vocabulary(final String id, final String name) {
this.id = id;
this.name = name;
@@ -30,7 +45,7 @@ public class Vocabulary implements Serializable {
}
public VocabularyTerm getTerm(final String id) {
return terms.get(id.toLowerCase());
return Optional.ofNullable(id).map(s -> s.toLowerCase()).map(s -> terms.get(s)).orElse(null);
}
protected void addTerm(final String id, final String name) {
@@ -40,4 +55,32 @@
protected boolean termExists(final String id) {
return terms.containsKey(id.toLowerCase());
}
protected void addSynonym(final String syn, final String termCode) {
synonyms.put(syn, termCode.toLowerCase());
}
public VocabularyTerm getTermBySynonym(final String syn) {
return getTerm(synonyms.get(syn));
}
public Qualifier getTermAsQualifier(final String termId) {
if (StringUtils.isBlank(termId)) {
return OafMapperUtils.unknown(getId(), getName());
} else if (termExists(termId)) {
final VocabularyTerm t = getTerm(termId);
return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
} else {
return OafMapperUtils.qualifier(termId, termId, getId(), getName());
}
}
public Qualifier getSynonymAsQualifier(final String syn) {
return Optional
.ofNullable(getTermBySynonym(syn))
.map(term -> getTermAsQualifier(term.getId()))
.orElse(null);
// .orElse(OafMapperUtils.unknown(getId(), getName()));
}
}
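
A minimal sketch (hypothetical class name) of the three getTermAsQualifier outcomes, declared in the same package since the mutators are protected:

package eu.dnetlib.dhp.oa.graph.raw.common;

import eu.dnetlib.dhp.schema.oaf.Qualifier;

public class VocabularySketch {
	public static void main(String[] args) {
		final Vocabulary v = new Vocabulary("dnet:languages", "dnet:languages");
		v.addTerm("eng", "English");
		v.addSynonym("en", "eng");

		final Qualifier known = v.getTermAsQualifier("eng");        // eng / English
		final Qualifier missing = v.getTermAsQualifier(null);       // UNKNOWN / Unknown fallback
		final Qualifier passthrough = v.getTermAsQualifier("xx");   // xx / xx, kept verbatim
		final Qualifier viaSynonym = v.getSynonymAsQualifier("en"); // resolves to eng / English

		System.out.println(
			known.getClassid() + " " + missing.getClassid() + " "
				+ passthrough.getClassid() + " " + viaSynonym.getClassid());
	}
}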

View File

@@ -1,33 +1,39 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.*;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class VocabularyGroup implements Serializable {
public static VocabularyGroup loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException {
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
public static final String VOCABULARIES_XQUERY = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')\n"
+
"let $vocid := $x//VOCABULARY_NAME/@code\n" +
"let $vocname := $x//VOCABULARY_NAME/text()\n" +
"for $term in ($x//TERM)\n" +
"return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name)";
final String xquery = IOUtils
.toString(
GenerateEntitiesApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery"));
public static final String VOCABULARY_SYNONYMS_XQUERY = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')\n"
+
"let $vocid := $x//VOCABULARY_NAME/@code\n" +
"let $vocname := $x//VOCABULARY_NAME/text()\n" +
"for $term in ($x//TERM)\n" +
"for $syn in ($term//SYNONYM/@term)\n" +
"return concat($vocid,' @=@ ',$term/@code,' @=@ ', $syn)\n";
public static VocabularyGroup loadVocsFromIS(ISLookUpService isLookUpService) throws ISLookUpException {
final VocabularyGroup vocs = new VocabularyGroup();
for (final String s : isLookUpService.quickSearchProfile(xquery)) {
for (final String s : isLookUpService.quickSearchProfile(VOCABULARIES_XQUERY)) {
final String[] arr = s.split("@=@");
if (arr.length == 4) {
final String vocId = arr[0].trim();
@@ -40,6 +46,19 @@ public class VocabularyGroup implements Serializable {
}
vocs.addTerm(vocId, termId, termName);
vocs.addSynonyms(vocId, termId, termId);
}
}
for (final String s : isLookUpService.quickSearchProfile(VOCABULARY_SYNONYMS_XQUERY)) {
final String[] arr = s.split("@=@");
if (arr.length == 3) {
final String vocId = arr[0].trim();
final String termId = arr[1].trim();
final String syn = arr[2].trim();
vocs.addSynonyms(vocId, termId, syn);
vocs.addSynonyms(vocId, termId, termId);
}
}
@@ -66,16 +85,37 @@
}
}
public Qualifier getTermAsQualifier(final String vocId, final String id) {
if (StringUtils.isBlank(id)) {
return OafMapperUtils.qualifier("UNKNOWN", "UNKNOWN", vocId, vocId);
} else if (termExists(vocId, id)) {
final Vocabulary v = vocs.get(vocId.toLowerCase());
final VocabularyTerm t = v.getTerm(id);
return OafMapperUtils.qualifier(t.getId(), t.getName(), v.getId(), v.getName());
} else {
return OafMapperUtils.qualifier(id, id, vocId, vocId);
public Set<String> getTerms(String vocId) {
if (!vocabularyExists(vocId)) {
return new HashSet<>();
}
return vocs
.get(vocId.toLowerCase())
.getTerms()
.values()
.stream()
.map(t -> t.getId())
.collect(Collectors.toCollection(HashSet::new));
}
public Qualifier lookup(String vocId, String id) {
return Optional
.ofNullable(getSynonymAsQualifier(vocId, id))
.orElse(getTermAsQualifier(vocId, id));
}
public Qualifier getTermAsQualifier(final String vocId, final String id) {
if (vocabularyExists(vocId)) {
return vocs.get(vocId.toLowerCase()).getTermAsQualifier(id);
}
return OafMapperUtils.qualifier(id, id, "", "");
}
public Qualifier getSynonymAsQualifier(final String vocId, final String syn) {
if (StringUtils.isBlank(vocId)) {
return OafMapperUtils.unknown("", "");
}
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
}
public boolean termExists(final String vocId, final String id) {
@@ -86,4 +126,16 @@
return vocs.containsKey(vocId.toLowerCase());
}
private void addSynonyms(final String vocId, final String termId, final String syn) {
String id = Optional
.ofNullable(vocId)
.map(s -> s.toLowerCase())
.orElseThrow(
() -> new IllegalArgumentException(String.format("empty vocabulary id for [term:%s, synonym:%s]", termId, syn)));
Optional
.ofNullable(vocs.get(id))
.orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId))
.addSynonym(syn, termId);
}
}
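
A minimal sketch (hypothetical class name) of lookup() resolving a synonym, stubbing the IS lookup service the way the unit test further down does; the vocabulary content here is an assumption:

package eu.dnetlib.dhp.oa.graph.raw.common;

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.util.Arrays;

import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class VocabularyGroupSketch {
	public static void main(String[] args) throws Exception {
		final ISLookUpService lookup = mock(ISLookUpService.class);
		when(lookup.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY))
			.thenReturn(Arrays.asList("dnet:languages @=@ dnet:languages @=@ eng @=@ English"));
		when(lookup.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
			.thenReturn(Arrays.asList("dnet:languages @=@ eng @=@ en"));

		final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(lookup);

		// lookup() tries the synonym table first, then falls back to the term itself
		final Qualifier q = vocs.lookup("dnet:languages", "en");
		System.out.println(q.getClassid() + " / " + q.getClassname()); // eng / English
	}
}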

View File

@@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@@ -0,0 +1,289 @@
<workflow-app name="clean graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphInputPath</name>
<description>the input path to read graph content</description>
</property>
<property>
<name>graphOutputPath</name>
<description>the target path to store cleaned graph</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<start to="fork_clean_graph"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<fork name="fork_clean_graph">
<path start="clean_publication"/>
<path start="clean_dataset"/>
<path start="clean_otherresearchproduct"/>
<path start="clean_software"/>
<path start="clean_datasource"/>
<path start="clean_organization"/>
<path start="clean_project"/>
<path start="clean_relation"/>
</fork>
<action name="clean_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean publications</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean datasets</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean otherresearchproducts</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean software</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_datasource">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean datasources</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_organization">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean organizations</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_project">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean projects</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_relation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean relations</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<join name="wait_clean" to="End"/>
<end name="End"/>
</workflow-app>

View File

@@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "in",
"paramLongName": "inputPath",
"paramDescription": "the path to the graph data dump to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path to store the output graph",
"paramRequired": true
},
{
"paramName": "isu",
"paramLongName": "isLookupUrl",
"paramDescription": "url to the ISLookup Service",
"paramRequired": true
},
{
"paramName": "class",
"paramLongName": "graphTableClassName",
"paramDescription": "class name moelling the graph table",
"paramRequired": true
}
]

View File

@@ -0,0 +1,6 @@
for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')
let $vocid := $x//VOCABULARY_NAME/@code
let $vocname := $x//VOCABULARY_NAME/text()
for $term in ($x//TERM)
for $syn in ($term//SYNONYM/@term)
return concat($vocid,' @=@ ',$term/@code,' @=@ ', $syn)

View File

@@ -0,0 +1,112 @@
package eu.dnetlib.dhp.oa.graph.clean;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.platform.commons.util.StringUtils;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
public class CleaningRuleTest {
public static final ObjectMapper MAPPER = new ObjectMapper();
@Mock
private ISLookUpService isLookUpService;
private VocabularyGroup vocabularies;
private CleaningRule<Publication> cleaningRule;
@BeforeEach
public void setUp() throws ISLookUpException, IOException {
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
cleaningRule = new CleaningRule(vocabularies);
}
@Test
public void testCleaning() throws Exception {
assertNotNull(cleaningRule.getVocabularies());
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = cleaningRule.call(p_in);
assertNotNull(p_out);
assertEquals("eng", p_out.getLanguage().getClassid());
assertEquals("English", p_out.getLanguage().getClassname());
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
Set<String> pidTerms = vocabularies.getTerms("dnet:pid_types");
assertTrue(
p_out
.getPid()
.stream()
.map(p -> p.getQualifier())
.allMatch(q -> pidTerms.contains(q.getClassid())));
// TODO add more assertions to verify the cleaned values
System.out.println(MAPPER.writeValueAsString(p_out));
assertTrue(
p_out
.getPid()
.stream()
.allMatch(sp -> StringUtils.isNotBlank(sp.getValue())));
}
private Stream<Qualifier> getAuthorPidTypes(Publication pub) {
return pub
.getAuthor()
.stream()
.map(a -> a.getPid())
.flatMap(p -> p.stream())
.map(s -> s.getQualifier());
}
private List<String> vocs() throws IOException {
return IOUtils
.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
}
private List<String> synonyms() throws IOException {
return IOUtils
.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
}
}

View File

@@ -0,0 +1,757 @@
{
"author": [
{
"affiliation": [
],
"fullname": "Brien, Tom",
"name": "Tom",
"pid": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "ORCID12",
"classname": "ORCID12",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "0000-0001-9613-6639"
}
],
"rank": 1,
"surname": "Brien"
},
{
"affiliation": [
],
"fullname": "Ade, Peter",
"name": "Peter",
"pid": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "xyz",
"classname": "XYZ",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "qwerty"
}
],
"rank": 2,
"surname": "Ade"
},
{
"affiliation": [
],
"fullname": "Barry, Peter S.",
"name": "Peter S.",
"pid": [
],
"rank": 3,
"surname": "Barry"
},
{
"affiliation": [
],
"fullname": "Dunscombe, Chris J.",
"name": "Chris J.",
"pid": [
],
"rank": 4,
"surname": "Dunscombe"
},
{
"affiliation": [
],
"fullname": "Leadley, David R.",
"name": "David R.",
"pid": [
],
"rank": 5,
"surname": "Leadley"
},
{
"affiliation": [
],
"fullname": "Morozov, Dmitry V.",
"name": "Dmitry V.",
"pid": [
],
"rank": 6,
"surname": "Morozov"
},
{
"affiliation": [
],
"fullname": "Myronov, Maksym",
"name": "Maksym",
"pid": [
],
"rank": 7,
"surname": "Myronov"
},
{
"affiliation": [
],
"fullname": "Parker, Evan",
"name": "Evan",
"pid": [
],
"rank": 8,
"surname": "Parker"
},
{
"affiliation": [
],
"fullname": "Prest, Martin J.",
"name": "Martin J.",
"pid": [
],
"rank": 9,
"surname": "Prest"
},
{
"affiliation": [
],
"fullname": "Prunnila, Mika",
"name": "Mika",
"pid": [
],
"rank": 10,
"surname": "Prunnila"
},
{
"affiliation": [
],
"fullname": "Sudiwala, Rashmi V.",
"name": "Rashmi V.",
"pid": [
],
"rank": 11,
"surname": "Sudiwala"
},
{
"affiliation": [
],
"fullname": "Whall, Terry E.",
"name": "Terry E.",
"pid": [
],
"rank": 12,
"surname": "Whall"
},
{
"affiliation": [
],
"fullname": "Mauskopf",
"name": "",
"pid": [
],
"rank": 13,
"surname": ""
},
{
"affiliation": [
],
"fullname": " P. D. ",
"name": "",
"pid": [
],
"rank": 14,
"surname": ""
}
],
"bestaccessright": {
"classid": "CLOSED",
"classname": "Closed Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"collectedfrom": [
{
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
"value": "VIRTA"
}
],
"context": [
],
"contributor": [
],
"country": [
],
"coverage": [
],
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"dateofacceptance": {
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"value": "2016-01-01"
},
"dateofcollection": "",
"dateoftransformation": "2020-04-22T12:34:08.009Z",
"description": [
],
"externalReference": [
],
"extraInfo": [
],
"format": [
],
"fulltext": [
],
"id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375",
"instance": [
{
"accessright": {
"classid": "CLOSED",
"classname": "CLOSED",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"collectedfrom": {
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
"value": "VIRTA"
},
"dateofacceptance": {
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"value": "2016-01-01"
},
"distributionlocation": "",
"hostedby": {
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
"value": "VIRTA"
},
"instancetype": {
"classid": "Comentario",
"classname": "Comentario",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
},
"url": [
"http://juuli.fi/Record/0275158616",
"http://dx.doi.org/10.1007/s109090161569x"
]
}
],
"journal": {
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"edition": "",
"ep": " 7",
"iss": "9 March",
"issnLinking": "",
"issnOnline": "",
"issnPrinted": "0022-2291",
"name": "Journal of Low Temperature Physics - Early Acces",
"sp": "1 ",
"vol": ""
},
"language": {
"classid": "en",
"classname": "en",
"schemeid": "dnet:languages",
"schemename": "dnet:languages"
},
"lastupdatetimestamp": 1591283286319,
"oaiprovenance": {
"originDescription": {
"altered": true,
"baseURL": "https%3A%2F%2Fvirta-jtp.csc.fi%2Fapi%2Fcerif",
"datestamp": "2019-07-30",
"harvestDate": "2020-04-22T11:04:38.685Z",
"identifier": "oai:virta-jtp.csc.fi:Publications/0275158616",
"metadataNamespace": ""
}
},
"originalId": [
"CSC_________::2250a70c903c6ac6e4c01438259e9375"
],
"pid": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1007/s109090161569x"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1007/s109090161569x"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": ""
}
],
"relevantdate": [
],
"resourcetype": {
"classid": "0001",
"classname": "0001",
"schemeid": "dnet:dataCite_resource",
"schemename": "dnet:dataCite_resource"
},
"resulttype": {
"classid": "publication",
"classname": "publication",
"schemeid": "dnet:result_typologies",
"schemename": "dnet:result_typologies"
},
"source": [
],
"subject": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "ta213"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "infrared detectors"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "lens antennas"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "silicon"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "slot antennas"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "strained silicon"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "cold electron bolometers"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "doped silicon"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "measure noise"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "noise equivalent power"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "optical characterisation"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "optical response"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "photon noise"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "silicon absorbers"
}
],
"title": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
}
]
}

View File

@@ -1168,10 +1168,10 @@ public class XmlRecordFactory implements Serializable {
.asXmlElement(
"distributionlocation", instance.getDistributionlocation()));
}
if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) {
if (instance.getRefereed() != null && !instance.getRefereed().isBlank()) {
fields
.add(
XmlSerializationUtils.asXmlElement("refereed", instance.getRefereed().getValue()));
XmlSerializationUtils.mapQualifier("refereed", instance.getRefereed()));
}
if (instance.getProcessingchargeamount() != null
&& isNotBlank(instance.getProcessingchargeamount().getValue())) {