From 758d27745d9e63b994e700896cd817eb74960446 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 27 Nov 2020 16:07:24 +0100 Subject: [PATCH] cleaning tab characters from text fields --- .../dhp/schema/oaf/CleaningFunctions.java | 16 ++-- .../dhp/schema/oaf/ResultTypeComparator.java | 17 ++-- .../dhp/schema/oaf/OafMapperUtilsTest.java | 85 +++++++++++-------- .../dhp/oa/dedup/model/Identifier.java | 2 +- 4 files changed, 69 insertions(+), 51 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index 0fae82a8a..5f191e9a9 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -15,7 +15,7 @@ public class CleaningFunctions { public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; - public static final String NEWLINES = "(?:\\n|\\r)"; + public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final Set PID_BLACKLIST = new HashSet<>(); @@ -109,7 +109,7 @@ public class CleaningFunctions { .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) - .map(CleaningFunctions::removeNewLines) + .map(CleaningFunctions::cleanValue) .collect(Collectors.toList())); } if (Objects.nonNull(r.getTitle())) { @@ -120,7 +120,7 @@ public class CleaningFunctions { .stream() .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(sp.getValue())) - .map(CleaningFunctions::removeNewLines) + .map(CleaningFunctions::cleanValue) .collect(Collectors.toList())); } if (Objects.nonNull(r.getDescription())) { @@ -131,7 +131,7 @@ public class CleaningFunctions { .stream() .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(sp.getValue())) - .map(CleaningFunctions::removeNewLines) + .map(CleaningFunctions::cleanValue) .collect(Collectors.toList())); } if (Objects.nonNull(r.getPid())) { @@ -228,13 +228,13 @@ public class CleaningFunctions { return value; } - protected static StructuredProperty removeNewLines(StructuredProperty s) { - s.setValue(s.getValue().replaceAll(NEWLINES, " ")); + protected static StructuredProperty cleanValue(StructuredProperty s) { + s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); return s; } - protected static Field removeNewLines(Field s) { - s.setValue(s.getValue().replaceAll(NEWLINES, " ")); + protected static Field cleanValue(Field s) { + s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); return s; } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java index eb54599ce..089d71a0c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.schema.oaf; +import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID; + import java.util.Comparator; import java.util.HashSet; import java.util.Optional; @@ -8,8 +10,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import com.google.common.collect.Sets; + import eu.dnetlib.dhp.schema.common.ModelConstants; -import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID; public class ResultTypeComparator implements Comparator { @@ -64,10 +66,13 @@ public class ResultTypeComparator implements Comparator { } protected HashSet getCollectedFromIds(Result left) { - return Optional.ofNullable(left.getCollectedfrom()) - .map(cf -> cf.stream() - .map(c -> c.getKey()) - .collect(Collectors.toCollection(HashSet::new))) - .orElse(new HashSet<>()); + return Optional + .ofNullable(left.getCollectedfrom()) + .map( + cf -> cf + .stream() + .map(c -> c.getKey()) + .collect(Collectors.toCollection(HashSet::new))) + .orElse(new HashSet<>()); } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtilsTest.java index fa5720c95..93840d534 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtilsTest.java @@ -1,56 +1,69 @@ + package eu.dnetlib.dhp.schema.oaf; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import it.unimi.dsi.fastutil.Hash; -import org.apache.commons.io.IOUtils; -import org.jetbrains.annotations.NotNull; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.util.HashSet; import java.util.List; import java.util.stream.Collectors; -import static org.junit.jupiter.api.Assertions.*; +import org.apache.commons.io.IOUtils; +import org.jetbrains.annotations.NotNull; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import it.unimi.dsi.fastutil.Hash; public class OafMapperUtilsTest { - private static ObjectMapper OBJECT_MAPPER = new ObjectMapper() - .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + private static ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - @Test - public void testMergePubs() throws IOException { - Publication p1 = read("publication_1.json", Publication.class); - Publication p2 = read("publication_2.json", Publication.class); - Dataset d1 = read("dataset_1.json", Dataset.class); - Dataset d2 = read("dataset_2.json", Dataset.class); + @Test + public void testMergePubs() throws IOException { + Publication p1 = read("publication_1.json", Publication.class); + Publication p2 = read("publication_2.json", Publication.class); + Dataset d1 = read("dataset_1.json", Dataset.class); + Dataset d2 = read("dataset_2.json", Dataset.class); - assertEquals(p1.getCollectedfrom().size(), 1); - assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID); - assertEquals(d2.getCollectedfrom().size(), 1); - assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); + assertEquals(p1.getCollectedfrom().size(), 1); + assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID); + assertEquals(d2.getCollectedfrom().size(), 1); + assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); - assertTrue(OafMapperUtils.mergeResults(p1, d2).getResulttype().getClassid().equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)); + assertTrue( + OafMapperUtils + .mergeResults(p1, d2) + .getResulttype() + .getClassid() + .equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)); - assertEquals(p2.getCollectedfrom().size(), 1); - assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); - assertEquals(d1.getCollectedfrom().size(), 1); - assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); + assertEquals(p2.getCollectedfrom().size(), 1); + assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); + assertEquals(d1.getCollectedfrom().size(), 1); + assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); - assertTrue(OafMapperUtils.mergeResults(p2, d1).getResulttype().getClassid().equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)); - } + assertTrue( + OafMapperUtils + .mergeResults(p2, d1) + .getResulttype() + .getClassid() + .equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)); + } - @NotNull - protected HashSet cfId(List collectedfrom) { - return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new)); - } + @NotNull + protected HashSet cfId(List collectedfrom) { + return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new)); + } - protected T read(String filename, Class clazz ) throws IOException { - final String json = IOUtils.toString(getClass().getResourceAsStream(filename)); - return OBJECT_MAPPER.readValue(json, clazz); - } + protected T read(String filename, Class clazz) throws IOException { + final String json = IOUtils.toString(getClass().getResourceAsStream(filename)); + return OBJECT_MAPPER.readValue(json, clazz); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java index cdc1fa24a..18f4f9b84 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java @@ -7,13 +7,13 @@ import java.text.SimpleDateFormat; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import com.google.common.collect.Sets; import eu.dnetlib.dhp.oa.dedup.DatePicker; import eu.dnetlib.dhp.schema.common.EntityType; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;