From cfc01f136e2dd10edd9ca006ded209de01c736ec Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 17 Nov 2020 12:27:06 +0100 Subject: [PATCH] PID filtering based on a blacklist --- .../dhp/oa/graph/clean/CleaningFunctions.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 4bcce8037..e9f783670 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -1,8 +1,10 @@ package eu.dnetlib.dhp.oa.graph.clean; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Objects; +import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; @@ -17,7 +19,13 @@ import eu.dnetlib.dhp.schema.oaf.*; public class CleaningFunctions { public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; - public static final String NONE = "none"; + + public static final Set PID_BLACKLIST = new HashSet<>(); + + static { + PID_BLACKLIST.add("none"); + PID_BLACKLIST.add("na"); + } public static T fixVocabularyNames(T value) { if (value instanceof Datasource) { @@ -114,7 +122,7 @@ public class CleaningFunctions { .stream() .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue()))) - .filter(sp -> !NONE.equalsIgnoreCase(sp.getValue().trim())) + .filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase())) .filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .map(sp -> {