From eb6acfbabc12b8b7b00eb9c35d115ad9727d9e0e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 28 May 2021 10:50:44 +0200 Subject: [PATCH] [cleaning] removing non parsable relation.validationDate(s) --- dhp-common/pom.xml | 4 ++++ .../schema/oaf/utils/GraphCleaningFunctions.java | 16 +++++++++++++++- .../dhp/schema/oaf/utils/OafMapperUtilsTest.java | 16 ++++++++++++---- pom.xml | 8 +++++++- 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index acac3594fa..b1494f649a 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -21,6 +21,10 @@ org.apache.hadoop hadoop-common + + commons-validator + commons-validator + org.apache.spark spark-core_2.11 diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 15fff07c02..da253c681a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -7,11 +7,13 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.validator.GenericValidator; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; public class GraphCleaningFunctions extends CleaningFunctions { @@ -115,7 +117,13 @@ public class GraphCleaningFunctions extends CleaningFunctions { o.setCountry(ModelConstants.UNKNOWN_COUNTRY); } } else if (value instanceof Relation) { - // nothing to clean here + Relation r = (Relation) value; + + if (!isValidDate(r.getValidationDate())) { + r.setValidationDate(null); + r.setValidated(false); + } + } else if (value instanceof Result) { Result r = (Result) value; @@ -292,6 +300,12 @@ public class GraphCleaningFunctions extends CleaningFunctions { return value; } + protected static boolean isValidDate(String date) { + return Stream + .of(ModelSupport.DATE_TIME_FORMATS) + .anyMatch(format -> GenericValidator.isDate(date, format, false)); + } + // HELPERS private static boolean isValidAuthorName(Author a) { diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java index 7256d6489f..e8135f2019 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; +import java.time.format.DateTimeParseException; import java.util.HashSet; import java.util.List; import java.util.stream.Collectors; @@ -15,16 +16,23 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.*; public class OafMapperUtilsTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + @Test + public void testDateValidation() { + + assertTrue(GraphCleaningFunctions.isValidDate("2016-05-07T12:41:19.202Z")); + assertTrue(GraphCleaningFunctions.isValidDate("2020-09-10 11:08:52")); + assertTrue(GraphCleaningFunctions.isValidDate("2016-04-05")); + assertFalse(GraphCleaningFunctions.isValidDate("2016 April 05")); + + } + @Test public void testMergePubs() throws IOException { Publication p1 = read("publication_1.json", Publication.class); diff --git a/pom.xml b/pom.xml index 5e5fec3087..5b96816d96 100644 --- a/pom.xml +++ b/pom.xml @@ -200,6 +200,12 @@ ${dhp.commons.lang.version} + + commons-validator + commons-validator + 1.7 + + com.google.guava guava @@ -730,7 +736,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.5.10] + [2.5.11] [4.0.3] [6.0.5] [3.1.6]