From e6e177dda0c06f877673b78baf25b3cc9cb390c3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 9 Dec 2021 13:57:53 +0100 Subject: [PATCH] vocabulary based cleaning considers also the term label when looking up for a synonym --- .../common/vocabulary/VocabularyGroup.java | 8 ++ .../clean/GraphCleaningFunctionsTest.java | 7 +- .../eu/dnetlib/dhp/oa/graph/clean/result.json | 86 +++++++++++++++++++ 3 files changed, 99 insertions(+), 2 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index d5f57849c7..1c129ff9ce 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -57,9 +57,17 @@ public class VocabularyGroup implements Serializable { final String syn = arr[2].trim(); vocs.addSynonyms(vocId, termId, syn); + } } + // add the term names as synonyms + vocs.vocs.values().forEach(voc -> { + voc.getTerms().values().forEach(term -> { + voc.addSynonym(term.getName().toLowerCase(), term.getId()); + }); + }); + return vocs; } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 0264a62662..5ceb644c9e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -151,6 +151,9 @@ public class GraphCleaningFunctionsTest { assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid()); assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname()); + assertEquals("0033", p_out.getInstance().get(1).getInstancetype().getClassid()); + assertEquals("Audiovisual", p_out.getInstance().get(1).getInstancetype().getClassname()); + assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid()); assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname()); @@ -164,7 +167,7 @@ public class GraphCleaningFunctionsTest { List poi = p_out.getInstance(); assertNotNull(poi); - assertEquals(1, poi.size()); + assertEquals(2, poi.size()); final Instance poii = poi.get(0); assertNotNull(poii); @@ -213,7 +216,7 @@ public class GraphCleaningFunctionsTest { final List pci = p_cleaned.getInstance(); assertNotNull(pci); - assertEquals(1, pci.size()); + assertEquals(2, pci.size()); final Instance pcii = pci.get(0); assertNotNull(pcii); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index b3e3024746..5b9e86c650 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -403,6 +403,92 @@ "http://juuli.fi/Record/0275158616", "http://dx.doi.org/10.1007/s109090161569x" ] + }, + { + "pid": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1002/s21010127267xy" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1008/abcd" + } + ], + "alternateIdentifier": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1009/qwerty" + } + ], + "accessright": { + "classid": "CLOSED", + "classname": "CLOSED", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "collectedfrom": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "dateofacceptance": { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "value": "2016-01-01" + }, + "distributionlocation": "", + "hostedby": { + "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", + "value": "VIRTA" + }, + "instancetype": { + "classid": "Audiovisual", + "classname": "Audiovisual", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + }, + "url": [ + "http://dx.doi.org/10.1002/s21010127267xy" + ] } ], "journal": {