From 32cee1f619eb30d2e2ac6083435b76b1aba7db09 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 5 Aug 2022 12:32:08 +0200 Subject: [PATCH] WIP: cleaning of subjects --- .../dhp/common/vocabulary/Vocabulary.java | 4 ++-- .../common/vocabulary/VocabularyGroup.java | 6 +++--- .../dhp/oa/graph/clean/CleaningRuleMap.java | 8 ++++---- .../clean/GraphCleaningFunctionsTest.java | 11 ++++++++++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 20 +++++++++---------- .../dnetlib/dhp/oa/graph/clean/synonyms.txt | 4 +++- .../eu/dnetlib/dhp/oa/graph/clean/terms.txt | 5 ++++- 7 files changed, 37 insertions(+), 21 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java index 24a30500d..3a8df5c9e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java @@ -85,8 +85,8 @@ public class Vocabulary implements Serializable { public Qualifier lookup(String id) { return Optional - .ofNullable(getSynonymAsQualifier(id)) - .orElse(getTermAsQualifier(id)); + .ofNullable(getSynonymAsQualifier(id)) + .orElse(getTermAsQualifier(id)); } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index 8435b8bf3..fc7175270 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -83,9 +83,9 @@ public class VocabularyGroup implements Serializable { public Optional find(final String vocId) { return Optional - .ofNullable(vocId) - .map(String::toLowerCase) - .map(vocs::get); + .ofNullable(vocId) + .map(String::toLowerCase) + .map(vocs::get); } public void addTerm(final String vocId, final String id, final String name) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 2a4183f1b..894d5d059 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -4,15 +4,15 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; -import eu.dnetlib.dhp.common.vocabulary.Vocabulary; -import eu.dnetlib.dhp.schema.oaf.*; +import javax.jws.WebParam; + import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; +import eu.dnetlib.dhp.common.vocabulary.Vocabulary; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; - -import javax.jws.WebParam; +import eu.dnetlib.dhp.schema.oaf.*; public class CleaningRuleMap extends HashMap, SerializableConsumer> implements Serializable { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 7c39efb40..f4c4581b1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -251,6 +251,17 @@ public class GraphCleaningFunctionsTest { pid.getQualifier().getClassname())); }); + assertNotNull(p_cleaned.getSubject()); + + List fos_subjects = p_cleaned + .getSubject() + .stream() + .filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())) + .collect(Collectors.toList()); + + assertNotNull(fos_subjects); + assertEquals(2, fos_subjects.size()); + // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_cleaned)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 78fdc4c9d..ea63bba28 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -743,12 +743,12 @@ "trust": "0.9" }, "qualifier": { - "classid": "", - "classname": "", - "schemeid": "", - "schemename": "" + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" }, - "value": "infrared detectors" + "value": "FOS: Mathematics" }, { "dataInfo": { @@ -765,12 +765,12 @@ "trust": "0.9" }, "qualifier": { - "classid": "", - "classname": "", - "schemeid": "", - "schemename": "" + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" }, - "value": "lens antennas" + "value": "FOS: Computer and information sciences" }, { "dataInfo": { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt index 39ed0cef1..409dfd5dc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt @@ -1243,4 +1243,6 @@ dnet:relation_relClass @=@ IsSupplementTo @=@ isSupplementTo dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo dnet:relation_subRelType @=@ relationship @=@ publicationDataset -dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned \ No newline at end of file +dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned +FOS @=@ 0101 mathematics @=@ FOS: Mathematics +FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt index 59311d5a7..83ca81670 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt @@ -1117,4 +1117,7 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ relationship @=@ relat dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement -dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version \ No newline at end of file +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version +FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics +FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences +FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences \ No newline at end of file