From 2e9e13444d2c4933c2e012a084e052e9ad6e24c4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 1 Oct 2020 12:50:40 +0200 Subject: [PATCH] author pids made unique by value --- .../dhp/oa/graph/clean/CleaningFunctions.java | 24 ++++++++++++++++++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 25 +++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 3a0eace1f..f615d69f2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -1,9 +1,12 @@ package eu.dnetlib.dhp.oa.graph.clean; +import java.util.LinkedHashMap; import java.util.Objects; +import java.util.function.Function; import java.util.stream.Collectors; +import com.clearspring.analytics.util.Lists; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; @@ -13,6 +16,8 @@ import eu.dnetlib.dhp.schema.oaf.*; public class CleaningFunctions { + public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; + public static T fixVocabularyNames(T value) { if (value instanceof Datasource) { // nothing to clean here @@ -139,6 +144,25 @@ public class CleaningFunctions { author.setRank(i++); } } + for(Author a : r.getAuthor()) { + if (Objects.isNull(a.getPid())) { + a.setPid(Lists.newArrayList()); + } else { + a.setPid( + a.getPid().stream() + .filter(p -> Objects.nonNull(p.getQualifier())) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .map(p -> { + p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); + return p; + }) + .collect(Collectors.toMap(StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, LinkedHashMap::new)) + .values() + .stream() + .collect(Collectors.toList())); + } + } + } if (value instanceof Publication) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index f51eed067..5c903cd0e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -27,6 +27,28 @@ "schemename": "dnet:pid_types" }, "value": "0000-0001-9613-6639" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "ORCID12", + "classname": "ORCID12", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "https://orcid.org/0000-0001-9613-6639" } ], "rank": 1, @@ -91,8 +113,7 @@ ], "fullname": "Barry, Peter S.", "name": "Peter S.", - "pid": [ - ], + "pid": null, "rank": 3, "surname": "Barry" },