From f216277219a164ddb12b9c99610a368eac0212c1 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 12 Feb 2021 16:34:52 +0100 Subject: [PATCH] Implemented cleaning date --- .../dhp/transformation/xslt/Cleaner.java | 2 +- .../dhp/transformation/xslt/DateCleaner.java | 100 ++++++++++++++++++ .../xslt/XSLTTransformationFunction.java | 1 + .../transformation/TransformationJobTest.java | 30 ++++-- .../eu/dnetlib/dhp/transform/input_zenodo.xml | 2 +- .../eu/dnetlib/dhp/transform/zenodo_tr.xslt | 7 +- 6 files changed, 128 insertions(+), 14 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java index 8b7024cbe5..fbf47c2ff9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java @@ -35,7 +35,7 @@ public class Cleaner implements ExtensionFunction, Serializable { @Override public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { XdmValue r = xdmValues[0]; - if (r.size() == 0){ + if (r.size() == 0) { return new XdmAtomicValue(""); } final String currentValue = xdmValues[0].itemAt(0).getStringValue(); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java new file mode 100644 index 0000000000..98bea8de41 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java @@ -0,0 +1,100 @@ +package eu.dnetlib.dhp.transformation.xslt; + +import net.sf.saxon.s9api.*; +import scala.Serializable; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +public class DateCleaner implements ExtensionFunction, Serializable { + + private final static List dateRegex = Arrays.asList( + //Y-M-D + Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE), + //M-D-Y + Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE), + //D-M-Y + Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE), + //Y + Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE) + ); + + private final static Pattern incompleteDateRegex = Pattern.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE); + + private final static List dformats = Arrays.asList( + DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH), + DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) + ); + + public String clean(final String inputDate) { + + Optional cleanedDate = dateRegex.stream().map( + p -> { + final Matcher matcher = p.matcher(inputDate); + if (matcher.find()) + return matcher.group(0); + else + return null; + } + ).filter(Objects::nonNull) + .map(m -> { + Optional cleanDate = dformats.stream() + .map(f -> { + try { + LocalDate parsedDate = LocalDate.parse(m, f); + if (parsedDate != null) + return parsedDate.toString(); + else + return null; + } catch (Throwable e) { + return null; + } + } + + ).filter(Objects::nonNull).findAny(); + + return cleanDate.orElse(null); + }).filter(Objects::nonNull).findAny(); + + if (cleanedDate.isPresent()) + return cleanedDate.get(); + + final Matcher matcher = incompleteDateRegex.matcher(inputDate); + if (matcher.find()){ + final Integer year = Integer.parseInt(matcher.group(1)); + final Integer month = Integer.parseInt(matcher.group(4) == null ? "01":matcher.group(4)); + return String.format("%d-%02d-01",year, month); + } + return null; + } + + @Override + public QName getName() { + return new QName("http://eu/dnetlib/trasform/dates", "dateISO"); + } + + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE); + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE) + }; + } + + @Override + public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + XdmValue r = xdmValues[0]; + if (r.size() == 0) { + return new XdmAtomicValue(""); + } + final String currentValue = xdmValues[0].itemAt(0).getStringValue(); + return new XdmAtomicValue(clean(currentValue)); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index d8707cd767..d37832bb46 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -41,6 +41,7 @@ public class XSLTTransformationFunction implements MapFunctionADCP - 2019-05-29 + 2019 diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt index e67ed9dda5..23e57579ba 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt @@ -4,8 +4,9 @@ xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:vocabulary="http://eu/dnetlib/trasform/extension" + xmlns:dateCleaner="http://eu/dnetlib/trasform/dates" xmlns:dr="http://www.driver-repository.eu/namespace/dr" - exclude-result-prefixes="xsl vocabulary"> + exclude-result-prefixes="xsl vocabulary dateCleaner"> @@ -53,7 +54,7 @@ + select="dateCleaner:dateISO(normalize-space(//*[local-name()='date'][@dateType='Available']))"/> @@ -112,7 +113,7 @@ + select="dateCleaner:dateISO(normalize-space(//*[local-name()='publicationYear']))"/>