[cleaning] removing non parsable relation.validationDate(s)

This commit is contained in:
Claudio Atzori 2021-05-28 10:50:44 +02:00
parent 6e3a4e9237
commit eb6acfbabc
4 changed files with 38 additions and 6 deletions

View File

@ -21,6 +21,10 @@
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId> <artifactId>hadoop-common</artifactId>
</dependency> </dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_2.11</artifactId>

View File

@ -7,11 +7,13 @@ import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.GenericValidator;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
public class GraphCleaningFunctions extends CleaningFunctions { public class GraphCleaningFunctions extends CleaningFunctions {
@ -115,7 +117,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
o.setCountry(ModelConstants.UNKNOWN_COUNTRY); o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
} }
} else if (value instanceof Relation) { } else if (value instanceof Relation) {
// nothing to clean here Relation r = (Relation) value;
if (!isValidDate(r.getValidationDate())) {
r.setValidationDate(null);
r.setValidated(false);
}
} else if (value instanceof Result) { } else if (value instanceof Result) {
Result r = (Result) value; Result r = (Result) value;
@ -292,6 +300,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return value; return value;
} }
protected static boolean isValidDate(String date) {
return Stream
.of(ModelSupport.DATE_TIME_FORMATS)
.anyMatch(format -> GenericValidator.isDate(date, format, false));
}
// HELPERS // HELPERS
private static boolean isValidAuthorName(Author a) { private static boolean isValidAuthorName(Author a) {

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException; import java.io.IOException;
import java.time.format.DateTimeParseException;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -15,16 +16,23 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
public class OafMapperUtilsTest { public class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testDateValidation() {
assertTrue(GraphCleaningFunctions.isValidDate("2016-05-07T12:41:19.202Z"));
assertTrue(GraphCleaningFunctions.isValidDate("2020-09-10 11:08:52"));
assertTrue(GraphCleaningFunctions.isValidDate("2016-04-05"));
assertFalse(GraphCleaningFunctions.isValidDate("2016 April 05"));
}
@Test @Test
public void testMergePubs() throws IOException { public void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class); Publication p1 = read("publication_1.json", Publication.class);

View File

@ -200,6 +200,12 @@
<version>${dhp.commons.lang.version}</version> <version>${dhp.commons.lang.version}</version>
</dependency> </dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.7</version>
</dependency>
<dependency> <dependency>
<groupId>com.google.guava</groupId> <groupId>com.google.guava</groupId>
<artifactId>guava</artifactId> <artifactId>guava</artifactId>
@ -730,7 +736,7 @@
<mockito-core.version>3.3.3</mockito-core.version> <mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version> <vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.5.10]</dhp-schemas.version> <dhp-schemas.version>[2.5.11]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version> <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version> <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version> <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>