From 41c70c607dc7fd8786da3711a52477907e5a4d1a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 9 Dec 2021 16:44:28 +0100 Subject: [PATCH] cleaning workflow assigns the proper default instance type when a value could not be cleaned using the vocabularies --- .../oaf/utils/GraphCleaningFunctions.java | 36 +++++++- .../oa/graph/clean/CleanGraphSparkJob.java | 2 +- .../clean/GraphCleaningFunctionsTest.java | 20 +++-- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 +- .../eu/dnetlib/dhp/oa/graph/clean/result.json | 90 ++++++++++++++++++- 5 files changed, 138 insertions(+), 12 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index f5781390a..42e19628c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -16,6 +16,8 @@ import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.collect.Lists; import com.google.common.collect.Sets; +import eu.dnetlib.dhp.common.vocabulary.Vocabulary; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; @@ -131,7 +133,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { return true; } - public static T cleanup(T value) { + public static T cleanup(T value, VocabularyGroup vocs) { if (value instanceof Datasource) { // nothing to clean here } else if (value instanceof Project) { @@ -250,6 +252,38 @@ public class GraphCleaningFunctions extends CleaningFunctions { if (Objects.nonNull(r.getInstance())) { for (Instance i : r.getInstance()) { + if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) { + if (r instanceof Publication) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof Dataset) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof Software) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof OtherResearchProduct) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } + } + if (Objects.nonNull(i.getPid())) { i.setPid(processPidCleaning(i.getPid())); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index d43d7ce28..2e2ea567a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -88,7 +88,7 @@ public class CleanGraphSparkJob { readTableFromPath(spark, inputPath, clazz) .map((MapFunction) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz)) .map((MapFunction) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) - .map((MapFunction) GraphCleaningFunctions::cleanup, Encoders.bean(clazz)) + .map((MapFunction) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz)) .filter((FilterFunction) GraphCleaningFunctions::filter) .write() .mode(SaveMode.Overwrite) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 5ceb644c9..a7c7eb810 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -151,8 +151,11 @@ public class GraphCleaningFunctionsTest { assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid()); assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname()); - assertEquals("0033", p_out.getInstance().get(1).getInstancetype().getClassid()); - assertEquals("Audiovisual", p_out.getInstance().get(1).getInstancetype().getClassname()); + assertEquals("0027", p_out.getInstance().get(1).getInstancetype().getClassid()); + assertEquals("Model", p_out.getInstance().get(1).getInstancetype().getClassname()); + + assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassid()); + assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassname()); assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid()); assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname()); @@ -167,7 +170,7 @@ public class GraphCleaningFunctionsTest { List poi = p_out.getInstance(); assertNotNull(poi); - assertEquals(2, poi.size()); + assertEquals(3, poi.size()); final Instance poii = poi.get(0); assertNotNull(poii); @@ -195,7 +198,7 @@ public class GraphCleaningFunctionsTest { assertEquals(5, p_out.getTitle().size()); - Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out); + Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies); assertEquals(3, p_cleaned.getTitle().size()); @@ -214,9 +217,12 @@ public class GraphCleaningFunctionsTest { assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue()); + assertEquals("0038", p_cleaned.getInstance().get(2).getInstancetype().getClassid()); + assertEquals("Other literature type", p_cleaned.getInstance().get(2).getInstancetype().getClassname()); + final List pci = p_cleaned.getInstance(); assertNotNull(pci); - assertEquals(2, pci.size()); + assertEquals(3, pci.size()); final Instance pcii = pci.get(0); assertNotNull(pcii); @@ -284,7 +290,7 @@ public class GraphCleaningFunctionsTest { .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json")); Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); - Publication cleaned = GraphCleaningFunctions.cleanup(p_out); + Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies); Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned)); } @@ -295,7 +301,7 @@ public class GraphCleaningFunctionsTest { .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json")); Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); - Publication cleaned = GraphCleaningFunctions.cleanup(p_out); + Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies); Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned)); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 27e33bf27..de79b750a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -708,7 +708,7 @@ class MappersTest { assertEquals(1, p.getTitle().size()); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); - final Publication p_cleaned = cleanup(fixVocabularyNames(p)); + final Publication p_cleaned = cleanup(fixVocabularyNames(p), vocs); assertNotNull(p_cleaned.getTitle()); assertFalse(p_cleaned.getTitle().isEmpty()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 5b9e86c65..78fdc4c9d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -481,14 +481,100 @@ "value": "VIRTA" }, "instancetype": { - "classid": "Audiovisual", - "classname": "Audiovisual", + "classid": "Model", + "classname": "Model", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource" }, "url": [ "http://dx.doi.org/10.1002/s21010127267xy" ] + }, + { + "pid": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1002/s21010127267xy" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1008/abcd" + } + ], + "alternateIdentifier": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1009/qwerty" + } + ], + "accessright": { + "classid": "CLOSED", + "classname": "CLOSED", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "collectedfrom": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "value": "2016-01-01" + }, + "distributionlocation": "", + "hostedby": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "instancetype": { + "classid": "xyz", + "classname": "xyz", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + }, + "url": [ + "http://dx.doi.org/10.1002/t32121238378t" + ] } ], "journal": {