WIP: graph cleaner implementation

This commit is contained in:
Claudio Atzori 2020-06-09 19:52:53 +02:00
parent d9f33582c5
commit a2fdf85ba1
12 changed files with 131 additions and 45 deletions

View File

@ -14,6 +14,7 @@ public class ModelConstants {
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource"; public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
public static final String DNET_COUNTRY_TYPE = "dnet:countries"; public static final String DNET_COUNTRY_TYPE = "dnet:countries";
public static final String DNET_REVIEW_LEVELS = "dnet:review_levels";
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository"; public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry"; public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";

View File

@ -31,7 +31,7 @@ public class Instance implements Serializable {
// typed results // typed results
private Field<String> processingchargecurrency; private Field<String> processingchargecurrency;
private Field<String> refereed; // peer-review status private Qualifier refereed; // peer-review status
public Field<String> getLicense() { public Field<String> getLicense() {
return license; return license;
@ -113,11 +113,11 @@ public class Instance implements Serializable {
this.processingchargecurrency = processingchargecurrency; this.processingchargecurrency = processingchargecurrency;
} }
public Field<String> getRefereed() { public Qualifier getRefereed() {
return refereed; return refereed;
} }
public void setRefereed(Field<String> refereed) { public void setRefereed(Qualifier refereed) {
this.refereed = refereed; this.refereed = refereed;
} }

View File

@ -96,12 +96,21 @@ public class ProtoConverter implements Serializable {
.stream() .stream()
.distinct() .distinct()
.collect(Collectors.toCollection(ArrayList::new)) : null); .collect(Collectors.toCollection(ArrayList::new)) : null);
i.setRefereed(mapStringField(ri.getRefereed())); i.setRefereed(mapRefereed(ri.getRefereed()));
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount())); i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency())); i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
return i; return i;
} }
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
Qualifier q = new Qualifier();
q.setClassid(refereed.getValue());
q.setSchemename(refereed.getValue());
q.setSchemeid("dnet:review_levels");
q.setSchemename("dnet:review_levels");
return q;
}
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) { private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
ResultProtos.Result r = oaf.getEntity().getResult(); ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getExternalReferenceCount() > 0) { if (r.getExternalReferenceCount() > 0) {

View File

@ -8,6 +8,7 @@ import java.io.File;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
@ -19,6 +20,7 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message; import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager; import eu.dnetlib.message.MessageManager;
@Disabled
public class DnetCollectorWorkerApplicationTests { public class DnetCollectorWorkerApplicationTests {
private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class); private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class);

View File

@ -166,8 +166,10 @@ case object Crossref2Oaf {
val has_review = (json \ "relation" \"has-review" \ "id") val has_review = (json \ "relation" \"has-review" \ "id")
if(has_review != JNothing) if(has_review != JNothing) {
instance.setRefereed(asField("peerReviewed")) instance.setRefereed(
createQualifier("0001", "peerReviewed", "dnet:review_levels", "dnet:review_levels"))
}
instance.setAccessright(getRestrictedQualifier()) instance.setAccessright(getRestrictedQualifier())

View File

@ -1,19 +1,16 @@
package eu.dnetlib.dhp.oa.graph.clean; package eu.dnetlib.dhp.oa.graph.clean;
import java.beans.BeanInfo;
import java.beans.IntrospectionException;
import java.beans.Introspector;
import java.beans.PropertyDescriptor;
import java.lang.reflect.Field; import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException; import java.util.Arrays;
import java.util.*; import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
public class CleaningRule<T extends Oaf> implements MapFunction<T, T> { public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
@ -61,7 +58,6 @@ public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
if (value instanceof Qualifier) { if (value instanceof Qualifier) {
Qualifier q = (Qualifier) value; Qualifier q = (Qualifier) value;
if (vocabularies.vocabularyExists(q.getSchemeid())) { if (vocabularies.vocabularyExists(q.getSchemeid())) {
field.set(o, vocabularies.lookup(q.getSchemeid(), q.getClassid())); field.set(o, vocabularies.lookup(q.getSchemeid(), q.getClassid()));
} }
@ -86,4 +82,8 @@ public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
return fields; return fields;
} }
public VocabularyGroup getVocabularies() {
return vocabularies;
}
} }

View File

@ -4,13 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -139,7 +133,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
instance instance
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES)); .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
instance instance
.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); .setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance instance

View File

@ -4,19 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENTED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENT_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PART;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SUPPLEMENT;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
@ -129,7 +117,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
instance instance
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES)); .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance instance
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));

View File

@ -2,10 +2,9 @@
package eu.dnetlib.dhp.oa.graph.raw.common; package eu.dnetlib.dhp.oa.graph.raw.common;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.*;
import java.util.Map;
import java.util.Optional;
import java.util.function.Supplier; import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -86,6 +85,19 @@ public class VocabularyGroup implements Serializable {
} }
} }
public Set<String> getTerms(String vocId) {
if (!vocabularyExists(vocId)) {
return new HashSet<>();
}
return vocs
.get(vocId.toLowerCase())
.getTerms()
.values()
.stream()
.map(t -> t.getId())
.collect(Collectors.toCollection(HashSet::new));
}
public Qualifier lookup(String vocId, String id) { public Qualifier lookup(String vocId, String id) {
return Optional return Optional
.ofNullable(getSynonymAsQualifier(vocId, id)) .ofNullable(getSynonymAsQualifier(vocId, id))

View File

@ -1,14 +1,17 @@
package eu.dnetlib.dhp.oa.graph.clean; package eu.dnetlib.dhp.oa.graph.clean;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.lenient;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtendWith;
@ -18,7 +21,9 @@ import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -26,6 +31,7 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class CleaningRuleTest { public class CleaningRuleTest {
public static final ObjectMapper MAPPER = new ObjectMapper(); public static final ObjectMapper MAPPER = new ObjectMapper();
@Mock @Mock
private ISLookUpService isLookUpService; private ISLookUpService isLookUpService;
@ -47,18 +53,46 @@ public class CleaningRuleTest {
@Test @Test
public void testCleaning() throws Exception { public void testCleaning() throws Exception {
assertNotNull(cleaningRule.getVocabularies());
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")); String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = cleaningRule.call(p_in); Publication p_out = cleaningRule.call(p_in);
Assertions.assertNotNull(p_out); assertNotNull(p_out);
assertEquals("eng", p_out.getLanguage().getClassid());
assertEquals("English", p_out.getLanguage().getClassname());
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
Set<String> pidTerms = vocabularies.getTerms("dnet:pid_types");
assertTrue(
p_out
.getPid()
.stream()
.map(p -> p.getQualifier())
.allMatch(q -> pidTerms.contains(q.getClassid())));
// TODO add more assertions to verity the cleaned values // TODO add more assertions to verity the cleaned values
System.out.println(MAPPER.writeValueAsString(p_out)); System.out.println(MAPPER.writeValueAsString(p_out));
} }
private Stream<Qualifier> getAuthorPidTypes(Publication pub) {
return pub
.getAuthor()
.stream()
.map(a -> a.getPid())
.flatMap(p -> p.stream())
.map(s -> s.getQualifier());
}
private List<String> vocs() throws IOException { private List<String> vocs() throws IOException {
return IOUtils return IOUtils
.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")); .readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));

View File

@ -6,6 +6,28 @@
"fullname": "Brien, Tom", "fullname": "Brien, Tom",
"name": "Tom", "name": "Tom",
"pid": [ "pid": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "ORCID12",
"classname": "ORCID12",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "0000-0001-9613-6639"
}
], ],
"rank": 1, "rank": 1,
"surname": "Brien" "surname": "Brien"
@ -16,6 +38,28 @@
"fullname": "Ade, Peter", "fullname": "Ade, Peter",
"name": "Peter", "name": "Peter",
"pid": [ "pid": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "xyz",
"classname": "XYZ",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "qwerty"
}
], ],
"rank": 2, "rank": 2,
"surname": "Ade" "surname": "Ade"
@ -207,7 +251,7 @@
{ {
"accessright": { "accessright": {
"classid": "CLOSED", "classid": "CLOSED",
"classname": "Closed Access", "classname": "CLOSED",
"schemeid": "dnet:access_modes", "schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes" "schemename": "dnet:access_modes"
}, },

View File

@ -1165,10 +1165,10 @@ public class XmlRecordFactory implements Serializable {
.asXmlElement( .asXmlElement(
"distributionlocation", instance.getDistributionlocation())); "distributionlocation", instance.getDistributionlocation()));
} }
if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) { if (instance.getRefereed() != null && !instance.getRefereed().isBlank()) {
fields fields
.add( .add(
XmlSerializationUtils.asXmlElement("refereed", instance.getRefereed().getValue())); XmlSerializationUtils.mapQualifier("refereed", instance.getRefereed()));
} }
if (instance.getProcessingchargeamount() != null if (instance.getProcessingchargeamount() != null
&& isNotBlank(instance.getProcessingchargeamount().getValue())) { && isNotBlank(instance.getProcessingchargeamount().getValue())) {