From eeebd5a9209bc129c53ed1bcba8d745e0d185f4f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 24 Nov 2020 18:40:25 +0100 Subject: [PATCH] Cleaning workflow: remove newlines from titles, descriptions, subjects --- .../dhp/oa/graph/clean/CleaningFunctions.java | 67 ++++++++++++++++--- 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index e9f783670d..3d8ce6dcb2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -1,10 +1,7 @@ package eu.dnetlib.dhp.oa.graph.clean; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.Objects; -import java.util.Set; +import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; @@ -18,7 +15,9 @@ import eu.dnetlib.dhp.schema.oaf.*; public class CleaningFunctions { + public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; + public static final String NEWLINES = "(?:\\n|\\r)"; public static final Set PID_BLACKLIST = new HashSet<>(); @@ -79,7 +78,7 @@ public class CleaningFunctions { return value; } - protected static T fixDefaults(T value) { + public static T fixDefaults(T value) { if (value instanceof Datasource) { // nothing to clean here } else if (value instanceof Project) { @@ -112,6 +111,29 @@ public class CleaningFunctions { .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(CleaningFunctions::removeNewLines) + 
.collect(Collectors.toList())); + } + if (Objects.nonNull(r.getTitle())) { + r + .setTitle( + r + .getTitle() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .map(CleaningFunctions::removeNewLines) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getDescription())) { + r + .setDescription( + r + .getDescription() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .map(CleaningFunctions::removeNewLines) .collect(Collectors.toList())); } if (Objects.nonNull(r.getPid())) { @@ -125,10 +147,7 @@ public class CleaningFunctions { .filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase())) .filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) - .map(sp -> { - sp.setValue(StringUtils.trim(sp.getValue())); - return sp; - }) + .map(CleaningFunctions::normalizePidValue) .collect(Collectors.toList())); } if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { @@ -211,6 +230,16 @@ public class CleaningFunctions { return value; } + protected static StructuredProperty removeNewLines(StructuredProperty s) { + s.setValue(s.getValue().replaceAll(NEWLINES, " ")); + return s; + } + + protected static Field removeNewLines(Field s) { + s.setValue(s.getValue().replaceAll(NEWLINES, " ")); + return s; + } + // HELPERS private static void fixVocabName(Qualifier q, String vocabularyName) { @@ -226,4 +255,24 @@ public class CleaningFunctions { classid, classname, scheme, scheme); } + /** + * Utility method that normalises PID values on a per-type basis. + * @param pid the PID whose value will be normalised. + * @return the PID containing the normalised value. 
+ */ + public static StructuredProperty normalizePidValue(StructuredProperty pid) { + String value = Optional + .ofNullable(pid.getValue()) + .map(String::trim) + .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty")); + switch (pid.getQualifier().getClassid()) { + + // TODO add cleaning for more PID types as needed + case "doi": + pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, "")); + break; + } + return pid; + } + }