forked from antonis.lempesis/dnet-hadoop
WIP: graph cleaner implementation
This commit is contained in:
parent
d9f33582c5
commit
a2fdf85ba1
|
@ -14,6 +14,7 @@ public class ModelConstants {
|
|||
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
|
||||
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
|
||||
public static final String DNET_COUNTRY_TYPE = "dnet:countries";
|
||||
public static final String DNET_REVIEW_LEVELS = "dnet:review_levels";
|
||||
|
||||
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
|
||||
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
|
||||
|
|
|
@ -31,7 +31,7 @@ public class Instance implements Serializable {
|
|||
// typed results
|
||||
private Field<String> processingchargecurrency;
|
||||
|
||||
private Field<String> refereed; // peer-review status
|
||||
private Qualifier refereed; // peer-review status
|
||||
|
||||
public Field<String> getLicense() {
|
||||
return license;
|
||||
|
@ -113,11 +113,11 @@ public class Instance implements Serializable {
|
|||
this.processingchargecurrency = processingchargecurrency;
|
||||
}
|
||||
|
||||
public Field<String> getRefereed() {
|
||||
public Qualifier getRefereed() {
|
||||
return refereed;
|
||||
}
|
||||
|
||||
public void setRefereed(Field<String> refereed) {
|
||||
public void setRefereed(Qualifier refereed) {
|
||||
this.refereed = refereed;
|
||||
}
|
||||
|
||||
|
|
|
@ -96,12 +96,21 @@ public class ProtoConverter implements Serializable {
|
|||
.stream()
|
||||
.distinct()
|
||||
.collect(Collectors.toCollection(ArrayList::new)) : null);
|
||||
i.setRefereed(mapStringField(ri.getRefereed()));
|
||||
i.setRefereed(mapRefereed(ri.getRefereed()));
|
||||
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
|
||||
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
|
||||
return i;
|
||||
}
|
||||
|
||||
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
|
||||
Qualifier q = new Qualifier();
|
||||
q.setClassid(refereed.getValue());
|
||||
q.setSchemename(refereed.getValue());
|
||||
q.setSchemeid("dnet:review_levels");
|
||||
q.setSchemename("dnet:review_levels");
|
||||
return q;
|
||||
}
|
||||
|
||||
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result r = oaf.getEntity().getResult();
|
||||
if (r.getExternalReferenceCount() > 0) {
|
||||
|
|
|
@ -8,6 +8,7 @@ import java.io.File;
|
|||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
@ -19,6 +20,7 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
|||
import eu.dnetlib.message.Message;
|
||||
import eu.dnetlib.message.MessageManager;
|
||||
|
||||
@Disabled
|
||||
public class DnetCollectorWorkerApplicationTests {
|
||||
|
||||
private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class);
|
||||
|
|
|
@ -166,8 +166,10 @@ case object Crossref2Oaf {
|
|||
|
||||
val has_review = (json \ "relation" \"has-review" \ "id")
|
||||
|
||||
if(has_review != JNothing)
|
||||
instance.setRefereed(asField("peerReviewed"))
|
||||
if(has_review != JNothing) {
|
||||
instance.setRefereed(
|
||||
createQualifier("0001", "peerReviewed", "dnet:review_levels", "dnet:review_levels"))
|
||||
}
|
||||
|
||||
|
||||
instance.setAccessright(getRestrictedQualifier())
|
||||
|
|
|
@ -1,19 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import java.beans.BeanInfo;
|
||||
import java.beans.IntrospectionException;
|
||||
import java.beans.Introspector;
|
||||
import java.beans.PropertyDescriptor;
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
|
||||
public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
|
||||
|
@ -61,7 +58,6 @@ public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
|
|||
if (value instanceof Qualifier) {
|
||||
Qualifier q = (Qualifier) value;
|
||||
if (vocabularies.vocabularyExists(q.getSchemeid())) {
|
||||
|
||||
field.set(o, vocabularies.lookup(q.getSchemeid(), q.getClassid()));
|
||||
}
|
||||
|
||||
|
@ -86,4 +82,8 @@ public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
|
|||
|
||||
return fields;
|
||||
}
|
||||
|
||||
public VocabularyGroup getVocabularies() {
|
||||
return vocabularies;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,13 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw;
|
|||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -139,7 +133,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
instance
|
||||
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
|
||||
instance
|
||||
.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance
|
||||
|
|
|
@ -4,19 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw;
|
|||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENTED_BY;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENT_TO;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PART;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.SUPPLEMENT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
@ -129,7 +117,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
instance
|
||||
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
|
||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance
|
||||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
|
|
|
@ -2,10 +2,9 @@
|
|||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.*;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
|
@ -86,6 +85,19 @@ public class VocabularyGroup implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
public Set<String> getTerms(String vocId) {
|
||||
if (!vocabularyExists(vocId)) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
return vocs
|
||||
.get(vocId.toLowerCase())
|
||||
.getTerms()
|
||||
.values()
|
||||
.stream()
|
||||
.map(t -> t.getId())
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
public Qualifier lookup(String vocId, String id) {
|
||||
return Optional
|
||||
.ofNullable(getSynonymAsQualifier(vocId, id))
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
|
@ -18,7 +21,9 @@ import org.mockito.junit.jupiter.MockitoExtension;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
|
@ -26,6 +31,7 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
|||
public class CleaningRuleTest {
|
||||
|
||||
public static final ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
@Mock
|
||||
private ISLookUpService isLookUpService;
|
||||
|
||||
|
@ -47,18 +53,46 @@ public class CleaningRuleTest {
|
|||
@Test
|
||||
public void testCleaning() throws Exception {
|
||||
|
||||
assertNotNull(cleaningRule.getVocabularies());
|
||||
|
||||
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
|
||||
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||
|
||||
Publication p_out = cleaningRule.call(p_in);
|
||||
|
||||
Assertions.assertNotNull(p_out);
|
||||
assertNotNull(p_out);
|
||||
|
||||
assertEquals("eng", p_out.getLanguage().getClassid());
|
||||
assertEquals("English", p_out.getLanguage().getClassname());
|
||||
|
||||
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
|
||||
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
|
||||
|
||||
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
|
||||
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
|
||||
|
||||
Set<String> pidTerms = vocabularies.getTerms("dnet:pid_types");
|
||||
assertTrue(
|
||||
p_out
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(p -> p.getQualifier())
|
||||
.allMatch(q -> pidTerms.contains(q.getClassid())));
|
||||
|
||||
// TODO add more assertions to verify the cleaned values
|
||||
System.out.println(MAPPER.writeValueAsString(p_out));
|
||||
|
||||
}
|
||||
|
||||
private Stream<Qualifier> getAuthorPidTypes(Publication pub) {
|
||||
return pub
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.map(a -> a.getPid())
|
||||
.flatMap(p -> p.stream())
|
||||
.map(s -> s.getQualifier());
|
||||
}
|
||||
|
||||
private List<String> vocs() throws IOException {
|
||||
return IOUtils
|
||||
.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
|
||||
|
|
|
@ -6,6 +6,28 @@
|
|||
"fullname": "Brien, Tom",
|
||||
"name": "Tom",
|
||||
"pid": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "ORCID12",
|
||||
"classname": "ORCID12",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "0000-0001-9613-6639"
|
||||
}
|
||||
],
|
||||
"rank": 1,
|
||||
"surname": "Brien"
|
||||
|
@ -16,6 +38,28 @@
|
|||
"fullname": "Ade, Peter",
|
||||
"name": "Peter",
|
||||
"pid": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "xyz",
|
||||
"classname": "XYZ",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "qwerty"
|
||||
}
|
||||
],
|
||||
"rank": 2,
|
||||
"surname": "Ade"
|
||||
|
@ -207,7 +251,7 @@
|
|||
{
|
||||
"accessright": {
|
||||
"classid": "CLOSED",
|
||||
"classname": "Closed Access",
|
||||
"classname": "CLOSED",
|
||||
"schemeid": "dnet:access_modes",
|
||||
"schemename": "dnet:access_modes"
|
||||
},
|
||||
|
|
|
@ -1165,10 +1165,10 @@ public class XmlRecordFactory implements Serializable {
|
|||
.asXmlElement(
|
||||
"distributionlocation", instance.getDistributionlocation()));
|
||||
}
|
||||
if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) {
|
||||
if (instance.getRefereed() != null && !instance.getRefereed().isBlank()) {
|
||||
fields
|
||||
.add(
|
||||
XmlSerializationUtils.asXmlElement("refereed", instance.getRefereed().getValue()));
|
||||
XmlSerializationUtils.mapQualifier("refereed", instance.getRefereed()));
|
||||
}
|
||||
if (instance.getProcessingchargeamount() != null
|
||||
&& isNotBlank(instance.getProcessingchargeamount().getValue())) {
|
||||
|
|
Loading…
Reference in New Issue