[cleaning] removing non-parsable relation.validationDate(s)

This commit is contained in:
Claudio Atzori 2021-05-28 10:50:44 +02:00
parent 6e3a4e9237
commit eb6acfbabc
4 changed files with 38 additions and 6 deletions

View File

@ -21,6 +21,10 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>

View File

@ -7,11 +7,13 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.GenericValidator;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
public class GraphCleaningFunctions extends CleaningFunctions {
@ -115,7 +117,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
}
} else if (value instanceof Relation) {
// nothing to clean here
Relation r = (Relation) value;
if (!isValidDate(r.getValidationDate())) {
r.setValidationDate(null);
r.setValidated(false);
}
} else if (value instanceof Result) {
Result r = (Result) value;
@ -292,6 +300,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return value;
}
/**
 * Tells whether the given string is accepted as a date by at least one of the
 * patterns listed in {@link ModelSupport#DATE_TIME_FORMATS}.
 *
 * <p>Each pattern is tried in turn through
 * {@code GenericValidator.isDate(date, pattern, false)}, i.e. with the strict
 * flag disabled; the first pattern that accepts the value ends the search.
 *
 * <p>NOTE(review): {@code null} handling is delegated entirely to
 * {@code GenericValidator} - confirm it reports {@code false} rather than throwing.
 *
 * @param date the candidate date string
 * @return {@code true} when some pattern accepts {@code date}, {@code false} otherwise
 */
protected static boolean isValidDate(String date) {
	for (final String pattern : ModelSupport.DATE_TIME_FORMATS) {
		if (GenericValidator.isDate(date, pattern, false)) {
			return true;
		}
	}
	return false;
}
// HELPERS
private static boolean isValidAuthorName(Author a) {

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.time.format.DateTimeParseException;
import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;
@ -15,16 +16,23 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.*;
public class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
/**
 * Verifies that {@code GraphCleaningFunctions.isValidDate} accepts a set of
 * sample date strings and rejects a free-text date.
 */
@Test
public void testDateValidation() {
	// samples expected to be recognised as dates
	final String[] accepted = {
		"2016-05-07T12:41:19.202Z",
		"2020-09-10 11:08:52",
		"2016-04-05"
	};
	for (final String sample : accepted) {
		assertTrue(GraphCleaningFunctions.isValidDate(sample));
	}

	// a textual month name is expected to be rejected
	assertFalse(GraphCleaningFunctions.isValidDate("2016 April 05"));
}
@Test
public void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class);

View File

@ -200,6 +200,12 @@
<version>${dhp.commons.lang.version}</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
@ -730,7 +736,7 @@
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.5.10]</dhp-schemas.version>
<dhp-schemas.version>[2.5.11]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>