From e1a1bb3ee4f91223d71b76b22170999cec210ed9 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 24 Nov 2020 18:34:03 +0100 Subject: [PATCH] moved class CleaningFunctions in the correct package. Remove newlines from titles, descriptions, subjects --- .../dhp/schema/oaf/CleaningFunctions.java | 39 +++++++++++++++++-- .../schema/oaf/utils/IdentifierFactory.java | 3 +- .../schema/oaf/utils/PidValueComparator.java | 7 +--- .../oa/graph/clean/CleanGraphSparkJob.java | 4 +- .../dhp/oa/graph/raw/OafToOafMapper.java | 2 +- .../dhp/oa/graph/raw/OdfToOafMapper.java | 1 - .../oa/graph/clean/CleaningFunctionTest.java | 7 +--- 7 files changed, 43 insertions(+), 20 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index 390af6a97..0fae82a8a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.graph.clean; +package eu.dnetlib.dhp.schema.oaf; import java.util.*; import java.util.function.Function; @@ -10,12 +10,12 @@ import org.apache.commons.lang3.StringUtils; import com.clearspring.analytics.util.Lists; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; public class CleaningFunctions { public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; + public static final String NEWLINES = "(?:\\n|\\r)"; public static final Set PID_BLACKLIST = new HashSet<>(); @@ -76,7 +76,7 @@ public class CleaningFunctions { return value; } - protected static T fixDefaults(T value) { + public static T cleanup(T value) { if (value instanceof Datasource) { // nothing to clean here } else if (value instanceof Project) { @@ -109,6 +109,29 @@ public class CleaningFunctions { .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(CleaningFunctions::removeNewLines) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getTitle())) { + r + .setTitle( + r + .getTitle() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .map(CleaningFunctions::removeNewLines) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getDescription())) { + r + .setDescription( + r + .getDescription() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .map(CleaningFunctions::removeNewLines) .collect(Collectors.toList())); } if (Objects.nonNull(r.getPid())) { @@ -205,6 +228,16 @@ public class CleaningFunctions { return value; } + protected static StructuredProperty removeNewLines(StructuredProperty s) { + s.setValue(s.getValue().replaceAll(NEWLINES, " ")); + return s; + } + + protected static Field removeNewLines(Field s) { + s.setValue(s.getValue().replaceAll(NEWLINES, " ")); + return s; + } + // HELPERS private static void fixVocabName(Qualifier q, String vocabularyName) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java index 6090b8e2f..01bb92bf6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -7,11 +7,10 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.commons.lang.StringUtils; -import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions; +import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidValueComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidValueComparator.java index 087bbc121..7e53ba9b7 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidValueComparator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidValueComparator.java @@ -4,12 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils; import java.util.Comparator; import java.util.Optional; -import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.*; public class PidValueComparator implements Comparator { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 2dd49345c..04a5ef38d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.oa.graph.clean; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.oaf.CleaningFunctions.*; import java.util.Optional; @@ -26,7 +27,6 @@ import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import static eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions.*; public class CleanGraphSparkJob { @@ -89,7 +89,7 @@ public class CleanGraphSparkJob { readTableFromPath(spark, inputPath, clazz) .map((MapFunction) value -> fixVocabularyNames(value), Encoders.bean(clazz)) .map((MapFunction) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) - .map((MapFunction) value -> fixDefaults(value), Encoders.bean(clazz)) + .map((MapFunction) value -> cleanup(value), Encoders.bean(clazz)) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index af1a9aec6..e28e8bd3c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -16,9 +16,9 @@ import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.common.PacePerson; -import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; public class OafToOafMapper extends AbstractMdRecordToOafMapper { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 25ff4ae88..6ceaa405a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -12,7 +12,6 @@ import org.dom4j.Document; import org.dom4j.Node; import eu.dnetlib.dhp.common.PacePerson; -import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index cb34b0cb3..e38101f82 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -19,10 +19,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -89,7 +86,7 @@ public class CleaningFunctionTest { .map(p -> p.getQualifier()) .allMatch(q -> pidTerms.contains(q.getClassid()))); - Publication p_defaults = CleaningFunctions.fixDefaults(p_out); + Publication p_defaults = CleaningFunctions.cleanup(p_out); assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid()); assertNull(p_out.getPublisher());