From b47aaf4dd17b9446bd423637391fbe83aab80775 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Oct 2022 11:23:43 +0200 Subject: [PATCH] [cleaning] subjects declared as belonging to specific vocabularies whose values are not found in the vocab are set to type keyword --- .../oaf/utils/GraphCleaningFunctions.java | 2 +- .../dhp/oa/graph/clean/CleaningRuleMap.java | 37 ++++++++++--------- .../clean/country/CleanCountrySparkJob.java | 2 +- .../clean/GraphCleaningFunctionsTest.java | 10 +++++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 22 +++++++++++ 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 775f228eb..363f95423 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -211,7 +211,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { .orElse(s.getValue()), Function.identity(), (s1, s2) -> Collections - .min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator()))) + .min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator()))) .values()); r.setSubject(subjects); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 147e26699..5f3b4e1ca 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; +import java.util.Objects; import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.lang3.SerializationUtils; @@ -10,6 +11,7 @@ import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; @@ -31,29 +33,30 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer modified = new AtomicReference<>(false); + vocabularies.find(vocabularyId).ifPresent(vocabulary -> { - if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) { - return; - } - Qualifier newValue = vocabulary.lookup(subject.getValue()); - if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) { - subject.setValue(newValue.getClassid()); - subject.getQualifier().setClassid(vocabularyId); - subject.getQualifier().setClassname(vocabulary.getName()); - modified.set(true); + if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) { + Qualifier newValue = vocabulary.lookup(subject.getValue()); + if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) { + subject.setValue(newValue.getClassid()); + subject.getQualifier().setClassid(vocabularyId); + subject.getQualifier().setClassname(vocabulary.getName()); + } + } else if (vocabularyId.equals(subject.getQualifier().getClassid())) { + Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue()); + VocabularyTerm term = vocabulary.getTerm(subject.getValue()); + if (Objects.isNull(syn) && Objects.isNull(term)) { + subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD); + subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD); + } } }); - return modified.get(); } private static void cleanRelation(VocabularyGroup vocabularies, Relation r) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java index cd77f342e..45590f789 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java @@ -43,7 +43,7 @@ public class CleanCountrySparkJob implements Serializable { String jsonConfiguration = IOUtils .toString( - CleanContextSparkJob.class + CleanCountrySparkJob.class .getResourceAsStream( "/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 6c43da832..4035307e5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -278,6 +278,16 @@ public class GraphCleaningFunctionsTest { s -> "0102 computer and information sciences".equals(s.getValue()) & ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); + List s1 = p_cleaned + .getSubject() + .stream() + .filter(s -> s.getValue().equals("In Situ Hybridization")) + .collect(Collectors.toList()); + assertNotNull(s1); + assertEquals(1, s1.size()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get(0).getQualifier().getClassid()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get(0).getQualifier().getClassname()); + // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_cleaned)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 8e4fc4545..84ff35c08 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -706,6 +706,28 @@ "source": [ ], "subject": [ + { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:repository", + "classname": "sysimport:crosswalk:repository", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": false, + "trust": "0.9" + }, + "qualifier": { + "classid": "FOS", + "classname": "Fields of Science and Technology classification", + "schemeid": "dnet:result_subject", + "schemename": "dnet:result_subject" + }, + "value": "In Situ Hybridization" + }, { "dataInfo": { "deletedbyinference": false,