From b7c387c21f946adbc9da90ded95166205195edb0 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Fri, 12 Aug 2022 15:09:16 +0200
Subject: [PATCH] cleaning of subjects: avoid duplicated subjects, prioritise collected vs inferred or other sources

---
 .../oaf/utils/GraphCleaningFunctions.java     | 29 +++++++-----
 .../dhp/schema/oaf/utils/OafMapperUtils.java  | 13 +++++-
 .../utils/SubjectProvenanceComparator.java    | 46 +++++++++++++++++++
 .../clean/GraphCleaningFunctionsTest.java     |  6 ++-
 .../eu/dnetlib/dhp/oa/graph/clean/result.json | 22 +++++++++
 5 files changed, 101 insertions(+), 15 deletions(-)
 create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java

diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
index 151c536853..775f228ebf 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@@ -1,6 +1,8 @@
 
 package eu.dnetlib.dhp.schema.oaf.utils;
 
+import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
+
 import java.time.LocalDate;
 import java.time.ZoneId;
 import java.time.format.DateTimeFormatter;
@@ -16,7 +18,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
 
-import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
@@ -191,8 +192,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 						qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
 			}
 			if (Objects.nonNull(r.getSubject())) {
-				r
-					.setSubject(
+				List<Subject> subjects = Lists // one subject is kept per (qualifier classid + value) key
+					.newArrayList(
 						r
 							.getSubject()
 							.stream()
@@ -201,7 +202,18 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 							.filter(sp -> Objects.nonNull(sp.getQualifier()))
 							.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
 							.map(GraphCleaningFunctions::cleanValue)
-							.collect(Collectors.toList()));
+							.collect(
+								Collectors
+									.toMap(
+										s -> Optional
+											.ofNullable(s.getQualifier())
+											.map(q -> q.getClassid() + s.getValue())
+											.orElse(s.getValue()),
+										Function.identity(),
+										(s1, s2) -> Collections // duplicates: compare BOTH candidates, keep the preferred provenance
+											.min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator())))
+						.values());
+				r.setSubject(subjects);
 			}
 			if (Objects.nonNull(r.getTitle())) {
 				r
@@ -382,14 +394,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 										.filter(p -> StringUtils.isNotBlank(p.getValue()))
 										.map(p -> {
 											// hack to distinguish orcid from orcid_pending
-											String pidProvenance = Optional
-												.ofNullable(p.getDataInfo())
-												.map(
-													d -> Optional
-														.ofNullable(d.getProvenanceaction())
-														.map(Qualifier::getClassid)
-														.orElse(""))
-												.orElse("");
+											String pidProvenance = getProvenance(p.getDataInfo());
 											if (p
 												.getQualifier()
 												.getClassid()
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
index 6ba7d70f18..c58096d353 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
@@ -11,10 +11,10 @@ import java.util.function.Function;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 
-import eu.dnetlib.dhp.schema.common.ModelConstants;
 import org.apache.commons.lang3.StringUtils;
 
 import eu.dnetlib.dhp.schema.common.AccessRightComparator;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 
@@ -503,4 +503,15 @@ public class OafMapperUtils {
 		rel.setProperties(properties);
 		return rel;
 	}
+
+	/**
+	 * Returns the classid of the provenance action, or "" when dataInfo, the action or its classid is null.
+	 */
+	public static String getProvenance(DataInfo dataInfo) {
+		return Optional
+			.ofNullable(dataInfo)
+			.map(DataInfo::getProvenanceaction)
+			.map(Qualifier::getClassid)
+			.orElse("");
+	}
 }
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java
new file mode 100644
index 0000000000..f4e3c8841c
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java
@@ -0,0 +1,46 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
+import static org.apache.commons.lang3.StringUtils.isBlank;
+
+import java.util.Comparator;
+import java.util.Locale;
+
+import eu.dnetlib.dhp.schema.oaf.Subject;
+
+/**
+ * Orders subjects by the classid of their provenance action, most preferred first: provenances containing
+ * "crosswalk", then "user", "propagation", "iis", then any other non-blank provenance, and finally blank ones.
+ * Matching is case-insensitive and subjects falling in the same class compare as equal, so the result keeps
+ * the same magnitude and flips sign when the two arguments are swapped.
+ */
+public class SubjectProvenanceComparator implements Comparator<Subject> {
+
+	/** Provenance markers, most preferred first. */
+	private static final String[] PRIORITY = {
+		"crosswalk", "user", "propagation", "iis"
+	};
+
+	@Override
+	public int compare(Subject left, Subject right) {
+		return Integer.compare(rank(left), rank(right));
+	}
+
+	/** Returns the index of the first marker found in the provenance; unmatched and blank values rank last. */
+	private static int rank(Subject subject) {
+		final String provenance = getProvenance(subject.getDataInfo());
+		if (isBlank(provenance)) {
+			return PRIORITY.length + 1;
+		}
+		// Locale.ROOT keeps the match independent of the JVM default locale
+		final String normalized = provenance.toLowerCase(Locale.ROOT);
+		for (int i = 0; i < PRIORITY.length; i++) {
+			if (normalized.contains(PRIORITY[i])) {
+				return i;
+			}
+		}
+		// non-blank provenance without a known marker: after the known ones, before blank
+		return PRIORITY.length;
+	}
+}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java
index 
0289d62b44..c6222af147 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -260,14 +260,16 @@ public class GraphCleaningFunctionsTest { .collect(Collectors.toList()); assertNotNull(fos_subjects); - assertEquals(3, fos_subjects.size()); + assertEquals(2, fos_subjects.size()); assertTrue( fos_subjects .stream() .anyMatch( s -> "0101 mathematics".equals(s.getValue()) & - ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); + ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) & + "sysimport:crosswalk:datasetarchive".equals(s.getDataInfo().getProvenanceaction().getClassid()) + )); assertTrue( fos_subjects diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index a5ba747c40..8e4fc4545e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -794,6 +794,28 @@ }, "value": "0101 mathematics" }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "subject:fos", + "classname": "subject:fos", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "0101 mathematics" + }, { "dataInfo": { "deletedbyinference": false,