From 6163ecbf63858f5a3b34cffa19b63fa59f851d9a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 11 Oct 2022 11:20:03 +0200 Subject: [PATCH 1/2] [cleaning] renamed parameters in wf action --- .../dhp/oa/graph/clean/oozie_app/workflow.xml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 19e1b2a02c..6435d51318 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -471,7 +471,7 @@ yarn cluster - Clean publications counmtry + Clean publication country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -489,7 +489,7 @@ --workingPath${workingDir}/working/publication --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} @@ -500,7 +500,7 @@ yarn cluster - Clean datasets Country + Clean dataset country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -518,7 +518,7 @@ --workingPath${workingDir}/working/dataset --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} @@ -529,7 +529,7 @@ yarn cluster - Clean otherresearchproducts country + Clean otherresearchproduct country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -547,7 +547,7 @@ --workingPath${workingDir}/working/otherresearchproduct --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} @@ -558,7 +558,7 @@ yarn cluster - Clean softwares country + Clean software country eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob dhp-graph-mapper-${projectVersion}.jar @@ -576,7 +576,7 @@ --workingPath${workingDir}/working/software --country${country} --verifyParam${verifyCountryParam} - --datasourcePath${workingDir}/working/hostedby + --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} From b47aaf4dd17b9446bd423637391fbe83aab80775 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Oct 2022 11:23:43 +0200 Subject: [PATCH 2/2] [cleaning] subjects declared as belonging to specific vocabularies whose values are not found in the vocab are set to type keyword --- .../oaf/utils/GraphCleaningFunctions.java | 2 +- .../dhp/oa/graph/clean/CleaningRuleMap.java | 37 ++++++++++--------- .../clean/country/CleanCountrySparkJob.java | 2 +- .../clean/GraphCleaningFunctionsTest.java | 10 +++++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 22 +++++++++++ 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 775f228ebf..363f954234 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -211,7 +211,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { .orElse(s.getValue()), Function.identity(), (s1, s2) -> Collections - .min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator()))) + .min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator()))) .values()); r.setSubject(subjects); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 147e266999..5f3b4e1caa 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; +import java.util.Objects; import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.lang3.SerializationUtils; @@ -10,6 +11,7 @@ import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; @@ -31,29 +33,30 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer modified = new AtomicReference<>(false); + vocabularies.find(vocabularyId).ifPresent(vocabulary -> { - if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) { - return; - } - Qualifier newValue = vocabulary.lookup(subject.getValue()); - if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) { - subject.setValue(newValue.getClassid()); - subject.getQualifier().setClassid(vocabularyId); - subject.getQualifier().setClassname(vocabulary.getName()); - modified.set(true); + if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) { + Qualifier newValue = vocabulary.lookup(subject.getValue()); + if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) { + subject.setValue(newValue.getClassid()); + subject.getQualifier().setClassid(vocabularyId); + subject.getQualifier().setClassname(vocabulary.getName()); + } + } else if (vocabularyId.equals(subject.getQualifier().getClassid())) { + Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue()); + VocabularyTerm term = vocabulary.getTerm(subject.getValue()); + if (Objects.isNull(syn) && Objects.isNull(term)) { + subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD); + subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD); + } } }); - return modified.get(); } private static void cleanRelation(VocabularyGroup vocabularies, Relation r) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java index cd77f342e2..45590f789e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/CleanCountrySparkJob.java @@ -43,7 +43,7 @@ public class CleanCountrySparkJob implements Serializable { String jsonConfiguration = IOUtils .toString( - CleanContextSparkJob.class + CleanCountrySparkJob.class .getResourceAsStream( "/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 6c43da8328..4035307e5a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -278,6 +278,16 @@ public class GraphCleaningFunctionsTest { s -> "0102 computer and information sciences".equals(s.getValue()) & ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); + List s1 = p_cleaned + .getSubject() + .stream() + .filter(s -> s.getValue().equals("In Situ Hybridization")) + .collect(Collectors.toList()); + assertNotNull(s1); + assertEquals(1, s1.size()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get(0).getQualifier().getClassid()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get(0).getQualifier().getClassname()); + // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_cleaned)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 8e4fc4545e..84ff35c086 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -706,6 +706,28 @@ "source": [ ], "subject": [ + { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:repository", + "classname": "sysimport:crosswalk:repository", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": false, + "trust": "0.9" + }, + "qualifier": { + "classid": "FOS", + "classname": "Fields of Science and Technology classification", + "schemeid": "dnet:result_subject", + "schemename": "dnet:result_subject" + }, + "value": "In Situ Hybridization" + }, { "dataInfo": { "deletedbyinference": false,