From 4ff8007518dd5c9ce7b56ae95ce171dcbfd8b47a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 30 Jul 2020 16:24:39 +0200 Subject: [PATCH 1/2] added function to set the missing vocabulary names, used in the cleaning workflow as a pre-cleaning step --- .../dhp/schema/common/ModelConstants.java | 1 + .../oa/graph/clean/CleanGraphSparkJob.java | 60 +++++++++++++++++++ .../oa/graph/clean/CleaningFunctionTest.java | 27 +++++++-- .../eu/dnetlib/dhp/oa/graph/clean/result.json | 22 +++++++ 4 files changed, 105 insertions(+), 5 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index bf48605d2..b27fc9267 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -7,6 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier; public class ModelConstants { + public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies"; public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies"; public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource"; public static final String DNET_ACCESS_MODES = "dnet:access_modes"; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 7091d9740..524cd7975 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -90,6 +90,7 @@ public class CleanGraphSparkJob { final CleaningRuleMap mapping = CleaningRuleMap.create(vocs); readTableFromPath(spark, inputPath, clazz) + .map((MapFunction) value -> fixVocabularyNames(value), 
Encoders.bean(clazz)) .map((MapFunction) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) .map((MapFunction) value -> fixDefaults(value), Encoders.bean(clazz)) .write() @@ -98,6 +99,65 @@ public class CleanGraphSparkJob { .json(outputPath); } + protected static T fixVocabularyNames(T value) { + if (value instanceof Datasource) { + // nothing to clean here + } else if (value instanceof Project) { + // nothing to clean here + } else if (value instanceof Organization) { + Organization o = (Organization) value; + if (Objects.nonNull(o.getCountry())) { + fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE); + } + } else if (value instanceof Relation) { + // nothing to clean here + } else if (value instanceof Result) { + + Result r = (Result) value; + + fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES); + fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE); + fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES); + + if (Objects.nonNull(r.getSubject())) { + r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES)); + } + if (Objects.nonNull(r.getInstance())) { + for (Instance i : r.getInstance()) { + fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES); + fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS); + } + } + if (Objects.nonNull(r.getAuthor())) { + r.getAuthor().forEach(a -> { + if (Objects.nonNull(a.getPid())) { + a.getPid().forEach(p -> { + fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES); + }); + } + }); + } + if (value instanceof Publication) { + + } else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + + return value; + } + + private static void fixVocabName(Qualifier q, String vocabularyName) { + if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) { + 
q.setSchemeid(vocabularyName); + q.setSchemename(vocabularyName); + } + } + protected static T fixDefaults(T value) { if (value instanceof Datasource) { // nothing to clean here diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index 559a30b1e..1b597d4e8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -7,6 +7,8 @@ import static org.mockito.Mockito.lenient; import java.io.IOException; import java.util.List; import java.util.Set; +import java.util.function.Predicate; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.io.IOUtils; @@ -19,9 +21,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -62,7 +62,7 @@ public class CleaningFunctionTest { assertTrue(p_in instanceof Result); assertTrue(p_in instanceof Publication); - Publication p_out = OafCleaner.apply(p_in, mapping); + Publication p_out = OafCleaner.apply(CleanGraphSparkJob.fixVocabularyNames(p_in), mapping); assertNotNull(p_out); @@ -89,6 +89,15 @@ public class CleaningFunctionTest { Publication p_defaults = CleanGraphSparkJob.fixDefaults(p_out); assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid()); + getAuthorPids(p_defaults).forEach(pid -> { + System.out + .println( + String + .format( + "%s [%s - %s]", 
pid.getValue(), pid.getQualifier().getClassid(), + pid.getQualifier().getClassname())); + }); + // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_out)); @@ -97,7 +106,7 @@ public class CleaningFunctionTest { */ } - private Stream getAuthorPidTypes(Publication pub) { + private Stream getAuthorPidTypes(Result pub) { return pub .getAuthor() .stream() @@ -106,6 +115,14 @@ public class CleaningFunctionTest { .map(s -> s.getQualifier()); } + private Stream getAuthorPids(Result pub) { + return pub + .getAuthor() + .stream() + .map(a -> a.getPid()) + .flatMap(p -> p.stream()); + } + private List vocs() throws IOException { return IOUtils .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 5d0c0d1ed..67e690fae 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -59,6 +59,28 @@ "schemename": "dnet:pid_types" }, "value": "qwerty" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "ORCID", + "classname": "ORCID", + "schemeid": "", + "schemename": "" + }, + "value": "asdasd" } ], "rank": 2, From cd631bb5bc67ddf78a2ed7e1c5492b30eecd5444 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 30 Jul 2020 17:03:53 +0200 Subject: [PATCH 2/2] defaults fixed in the cleaning workflow forces 
result.publisher to NULL when result.publisher.value is empty --- .../java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java | 3 +++ .../eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java | 3 +++ .../test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json | 3 +++ 3 files changed, 9 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 524cd7975..ae1b37906 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -173,6 +173,9 @@ public class CleanGraphSparkJob { } else if (value instanceof Result) { Result r = (Result) value; + if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { + r.setPublisher(null); + } if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { r .setLanguage( diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index 1b597d4e8..e1ef847c3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -66,6 +66,8 @@ public class CleaningFunctionTest { assertNotNull(p_out); + assertNotNull(p_out.getPublisher()); + assertNull(p_out.getPublisher().getValue()); assertEquals("und", p_out.getLanguage().getClassid()); assertEquals("Undetermined", p_out.getLanguage().getClassname()); @@ -88,6 +90,7 @@ public class CleaningFunctionTest { Publication p_defaults = CleanGraphSparkJob.fixDefaults(p_out); 
assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid()); + assertNull(p_out.getPublisher()); getAuthorPids(p_defaults).forEach(pid -> { System.out diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 67e690fae..f51eed067 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -208,6 +208,9 @@ } ], "bestaccessright": null, + "publisher": { + "value": null + }, "collectedfrom": [ { "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",