forked from D-Net/dnet-hadoop
delegating the date parsing to https://github.com/sisyphsu/dateparser
This commit is contained in:
parent
741077dbca
commit
a900bfb874
|
@ -22,8 +22,8 @@
|
||||||
<artifactId>hadoop-common</artifactId>
|
<artifactId>hadoop-common</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>commons-validator</groupId>
|
<groupId>com.github.sisyphsu</groupId>
|
||||||
<artifactId>commons-validator</artifactId>
|
<artifactId>dateparser</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
|
|
|
@ -1,15 +1,23 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.time.LocalDate;
|
||||||
|
import java.time.ZoneId;
|
||||||
|
import java.time.format.DateTimeFormatter;
|
||||||
|
import java.time.format.DateTimeParseException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.validator.GenericValidator;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
|
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
import com.google.common.collect.Maps;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
@ -119,14 +127,42 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
} else if (value instanceof Relation) {
|
} else if (value instanceof Relation) {
|
||||||
Relation r = (Relation) value;
|
Relation r = (Relation) value;
|
||||||
|
|
||||||
if (!isValidDate(r.getValidationDate())) {
|
Optional<String> validationDate = doCleanDate(r.getValidationDate());
|
||||||
|
if (validationDate.isPresent()) {
|
||||||
|
r.setValidationDate(validationDate.get());
|
||||||
|
r.setValidated(true);
|
||||||
|
} else {
|
||||||
r.setValidationDate(null);
|
r.setValidationDate(null);
|
||||||
r.setValidated(false);
|
r.setValidated(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (value instanceof Result) {
|
} else if (value instanceof Result) {
|
||||||
|
|
||||||
Result r = (Result) value;
|
Result r = (Result) value;
|
||||||
|
|
||||||
|
if (Objects.nonNull(r.getDateofacceptance())) {
|
||||||
|
Optional<String> date = cleanDateField(r.getDateofacceptance());
|
||||||
|
if (date.isPresent()) {
|
||||||
|
r.getDateofacceptance().setValue(date.get());
|
||||||
|
} else {
|
||||||
|
r.setDateofacceptance(null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (Objects.nonNull(r.getRelevantdate())) {
|
||||||
|
r
|
||||||
|
.setRelevantdate(
|
||||||
|
r
|
||||||
|
.getRelevantdate()
|
||||||
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||||
|
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||||
|
.map(sp -> {
|
||||||
|
sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue()));
|
||||||
|
return sp;
|
||||||
|
})
|
||||||
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
|
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
|
||||||
r.setPublisher(null);
|
r.setPublisher(null);
|
||||||
}
|
}
|
||||||
|
@ -222,6 +258,14 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
if (Objects.isNull(i.getRefereed())) {
|
if (Objects.isNull(i.getRefereed())) {
|
||||||
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
|
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
|
||||||
}
|
}
|
||||||
|
if (Objects.nonNull(i.getDateofacceptance())) {
|
||||||
|
Optional<String> date = cleanDateField(i.getDateofacceptance());
|
||||||
|
if (date.isPresent()) {
|
||||||
|
i.getDateofacceptance().setValue(date.get());
|
||||||
|
} else {
|
||||||
|
i.setDateofacceptance(null);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
||||||
|
@ -300,10 +344,34 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static boolean isValidDate(String date) {
|
private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
|
||||||
return Stream
|
return Optional
|
||||||
.of(ModelSupport.DATE_TIME_FORMATS)
|
.ofNullable(dateofacceptance)
|
||||||
.anyMatch(format -> GenericValidator.isDate(date, format, false));
|
.map(Field::getValue)
|
||||||
|
.map(GraphCleaningFunctions::cleanDate)
|
||||||
|
.filter(Objects::nonNull);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static Optional<String> doCleanDate(String date) {
|
||||||
|
return Optional.ofNullable(cleanDate(date));
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String cleanDate(final String inputDate) {
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(inputDate)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
final LocalDate date = DateParserUtils
|
||||||
|
.parseDate(inputDate.trim())
|
||||||
|
.toInstant()
|
||||||
|
.atZone(ZoneId.systemDefault())
|
||||||
|
.toLocalDate();
|
||||||
|
return DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT).format(date);
|
||||||
|
} catch (DateTimeParseException e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// HELPERS
|
// HELPERS
|
||||||
|
|
|
@ -4,9 +4,12 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.time.format.DateTimeParseException;
|
import java.time.LocalDate;
|
||||||
|
import java.time.format.DateTimeFormatter;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -26,10 +29,105 @@ public class OafMapperUtilsTest {
|
||||||
@Test
|
@Test
|
||||||
public void testDateValidation() {
|
public void testDateValidation() {
|
||||||
|
|
||||||
assertTrue(GraphCleaningFunctions.isValidDate("2016-05-07T12:41:19.202Z"));
|
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
|
||||||
assertTrue(GraphCleaningFunctions.isValidDate("2020-09-10 11:08:52"));
|
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
|
||||||
assertTrue(GraphCleaningFunctions.isValidDate("2016-04-05"));
|
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
|
||||||
assertFalse(GraphCleaningFunctions.isValidDate("2016 April 05"));
|
|
||||||
|
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
|
||||||
|
|
||||||
|
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
|
||||||
|
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
|
||||||
|
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
|
||||||
|
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
|
||||||
|
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
|
||||||
|
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
|
||||||
|
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
|
||||||
|
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
|
||||||
|
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
|
||||||
|
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
|
||||||
|
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
|
||||||
|
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
|
||||||
|
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
|
||||||
|
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
|
||||||
|
assertEquals(
|
||||||
|
"2015-07-03",
|
||||||
|
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
|
||||||
|
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
|
||||||
|
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
|
||||||
|
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
|
||||||
|
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
|
||||||
|
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
|
||||||
|
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
|
||||||
|
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
|
||||||
|
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
|
||||||
|
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
|
||||||
|
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
|
||||||
|
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
|
||||||
|
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
|
||||||
|
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
|
||||||
|
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
|
||||||
|
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
|
||||||
|
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
|
||||||
|
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
|
||||||
|
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
|
||||||
|
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
|
||||||
|
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
|
||||||
|
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
|
||||||
|
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
|
||||||
|
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
|
||||||
|
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
|
||||||
|
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
|
||||||
|
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
|
||||||
|
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
|
||||||
|
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
|
||||||
|
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
|
||||||
|
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
|
||||||
|
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
|
||||||
|
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
|
||||||
|
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
|
||||||
|
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
|
||||||
|
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
|
||||||
|
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
|
||||||
|
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
|
||||||
|
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
|
||||||
|
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
|
||||||
|
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
|
||||||
|
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
|
||||||
|
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
|
||||||
|
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
|
||||||
|
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
|
||||||
|
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
|
||||||
|
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
|
||||||
|
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
|
||||||
|
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
|
||||||
|
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
|
||||||
|
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
|
||||||
|
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
|
||||||
|
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
|
||||||
|
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
|
||||||
|
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
|
||||||
|
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
|
||||||
|
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
|
||||||
|
assertEquals(
|
||||||
|
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
|
||||||
|
assertEquals(
|
||||||
|
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
|
||||||
|
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
|
||||||
|
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
|
||||||
|
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
|
||||||
|
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
|
||||||
|
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
|
||||||
|
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
|
||||||
|
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
|
||||||
|
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
|
||||||
|
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
|
||||||
|
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
|
||||||
|
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
|
||||||
|
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
|
||||||
|
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
|
||||||
|
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
|
||||||
|
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
|
||||||
|
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,87 +10,11 @@ import java.util.*;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||||
import net.sf.saxon.s9api.*;
|
import net.sf.saxon.s9api.*;
|
||||||
|
|
||||||
public class DateCleaner implements ExtensionFunction, Serializable {
|
public class DateCleaner implements ExtensionFunction, Serializable {
|
||||||
|
|
||||||
private final static List<Pattern> dateRegex = Arrays
|
|
||||||
.asList(
|
|
||||||
// Y-M-D
|
|
||||||
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
|
||||||
// M-D-Y
|
|
||||||
Pattern
|
|
||||||
.compile(
|
|
||||||
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
|
|
||||||
Pattern.MULTILINE),
|
|
||||||
// D-M-Y
|
|
||||||
Pattern
|
|
||||||
.compile(
|
|
||||||
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
|
|
||||||
Pattern.MULTILINE),
|
|
||||||
// Y
|
|
||||||
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE));
|
|
||||||
|
|
||||||
private final static Pattern incompleteDateRegex = Pattern
|
|
||||||
.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE);
|
|
||||||
|
|
||||||
private final static List<DateTimeFormatter> dformats = Arrays
|
|
||||||
.asList(
|
|
||||||
DateTimeFormatter
|
|
||||||
.ofPattern(
|
|
||||||
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
|
|
||||||
Locale.ENGLISH),
|
|
||||||
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN));
|
|
||||||
|
|
||||||
public String clean(final String inputDate) {
|
|
||||||
|
|
||||||
Optional<String> cleanedDate = dateRegex
|
|
||||||
.stream()
|
|
||||||
.map(
|
|
||||||
p -> {
|
|
||||||
final Matcher matcher = p.matcher(inputDate);
|
|
||||||
if (matcher.find())
|
|
||||||
return matcher.group(0);
|
|
||||||
else
|
|
||||||
return null;
|
|
||||||
})
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.map(m -> {
|
|
||||||
Optional<String> cleanDate = dformats
|
|
||||||
.stream()
|
|
||||||
.map(f -> {
|
|
||||||
try {
|
|
||||||
LocalDate parsedDate = LocalDate.parse(m, f);
|
|
||||||
if (parsedDate != null)
|
|
||||||
return parsedDate.toString();
|
|
||||||
else
|
|
||||||
return null;
|
|
||||||
} catch (Throwable e) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
)
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.findAny();
|
|
||||||
|
|
||||||
return cleanDate.orElse(null);
|
|
||||||
})
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.findAny();
|
|
||||||
|
|
||||||
if (cleanedDate.isPresent())
|
|
||||||
return cleanedDate.get();
|
|
||||||
|
|
||||||
final Matcher matcher = incompleteDateRegex.matcher(inputDate);
|
|
||||||
if (matcher.find()) {
|
|
||||||
final Integer year = Integer.parseInt(matcher.group(1));
|
|
||||||
final Integer month = Integer.parseInt(matcher.group(4) == null ? "01" : matcher.group(4));
|
|
||||||
return String.format("%d-%02d-01", year, month);
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public QName getName() {
|
public QName getName() {
|
||||||
return new QName(QNAME_BASE_URI + "/dateISO", "dateISO");
|
return new QName(QNAME_BASE_URI + "/dateISO", "dateISO");
|
||||||
|
@ -117,4 +41,9 @@ public class DateCleaner implements ExtensionFunction, Serializable {
|
||||||
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
||||||
return new XdmAtomicValue(clean(currentValue));
|
return new XdmAtomicValue(clean(currentValue));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// for backward compatibility with the existing unit tests
|
||||||
|
public String clean(String date) {
|
||||||
|
return GraphCleaningFunctions.cleanDate(date);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,11 +51,11 @@ public class TransformationJobTest extends AbstractVocabularyTest {
|
||||||
@DisplayName("Test Date cleaner")
|
@DisplayName("Test Date cleaner")
|
||||||
public void testDateCleaner() throws Exception {
|
public void testDateCleaner() throws Exception {
|
||||||
DateCleaner dc = new DateCleaner();
|
DateCleaner dc = new DateCleaner();
|
||||||
assertEquals(dc.clean("20/09/1982"), "1982-09-20");
|
assertEquals("1982-09-20", dc.clean("20/09/1982"));
|
||||||
assertEquals(dc.clean("20-09-2002"), "2002-09-20");
|
assertEquals("2002-09-20", dc.clean("20-09-2002"));
|
||||||
assertEquals(dc.clean("2002-09-20"), "2002-09-20");
|
assertEquals("2002-09-20", dc.clean("2002-09-20"));
|
||||||
assertEquals(dc.clean("2002-9"), "2002-09-01");
|
assertEquals("2002-09-01", dc.clean("2002-9"));
|
||||||
assertEquals(dc.clean("2021"), "2021-01-01");
|
assertEquals("2021-01-01", dc.clean("2021"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -129,6 +129,8 @@ public class GraphCleaningFunctionsTest {
|
||||||
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
||||||
assertNull(p_out.getPublisher());
|
assertNull(p_out.getPublisher());
|
||||||
|
|
||||||
|
assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue());
|
||||||
|
|
||||||
final List<Instance> pci = p_cleaned.getInstance();
|
final List<Instance> pci = p_cleaned.getInstance();
|
||||||
assertNotNull(pci);
|
assertNotNull(pci);
|
||||||
assertEquals(1, pci.size());
|
assertEquals(1, pci.size());
|
||||||
|
|
|
@ -301,7 +301,7 @@
|
||||||
},
|
},
|
||||||
"trust": "0.9"
|
"trust": "0.9"
|
||||||
},
|
},
|
||||||
"value": "2016-01-01"
|
"value": "7 oct 1970"
|
||||||
},
|
},
|
||||||
"dateofcollection": "",
|
"dateofcollection": "",
|
||||||
"dateoftransformation": "2020-04-22T12:34:08.009Z",
|
"dateoftransformation": "2020-04-22T12:34:08.009Z",
|
||||||
|
|
8
pom.xml
8
pom.xml
|
@ -201,9 +201,9 @@
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>commons-validator</groupId>
|
<groupId>com.github.sisyphsu</groupId>
|
||||||
<artifactId>commons-validator</artifactId>
|
<artifactId>dateparser</artifactId>
|
||||||
<version>1.7</version>
|
<version>1.0.7</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -736,7 +736,7 @@
|
||||||
<mockito-core.version>3.3.3</mockito-core.version>
|
<mockito-core.version>3.3.3</mockito-core.version>
|
||||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||||
<vtd.version>[2.12,3.0)</vtd.version>
|
<vtd.version>[2.12,3.0)</vtd.version>
|
||||||
<dhp-schemas.version>[2.5.11]</dhp-schemas.version>
|
<dhp-schemas.version>[2.5.12-SNAPSHOT]</dhp-schemas.version>
|
||||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||||
|
|
Loading…
Reference in New Issue