forked from D-Net/dnet-hadoop
Implemented cleaning date
This commit is contained in:
parent
17e6f1934e
commit
f216277219
|
@ -0,0 +1,100 @@
|
|||
package eu.dnetlib.dhp.transformation.xslt;
|
||||
|
||||
import net.sf.saxon.s9api.*;
|
||||
import scala.Serializable;
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
public class DateCleaner implements ExtensionFunction, Serializable {
|
||||
|
||||
private final static List<Pattern> dateRegex = Arrays.asList(
|
||||
//Y-M-D
|
||||
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
||||
//M-D-Y
|
||||
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
|
||||
//D-M-Y
|
||||
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
|
||||
//Y
|
||||
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
||||
);
|
||||
|
||||
private final static Pattern incompleteDateRegex = Pattern.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE);
|
||||
|
||||
private final static List<DateTimeFormatter> dformats = Arrays.asList(
|
||||
DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH),
|
||||
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
||||
);
|
||||
|
||||
public String clean(final String inputDate) {
|
||||
|
||||
Optional<String> cleanedDate = dateRegex.stream().map(
|
||||
p -> {
|
||||
final Matcher matcher = p.matcher(inputDate);
|
||||
if (matcher.find())
|
||||
return matcher.group(0);
|
||||
else
|
||||
return null;
|
||||
}
|
||||
).filter(Objects::nonNull)
|
||||
.map(m -> {
|
||||
Optional<String> cleanDate = dformats.stream()
|
||||
.map(f -> {
|
||||
try {
|
||||
LocalDate parsedDate = LocalDate.parse(m, f);
|
||||
if (parsedDate != null)
|
||||
return parsedDate.toString();
|
||||
else
|
||||
return null;
|
||||
} catch (Throwable e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
).filter(Objects::nonNull).findAny();
|
||||
|
||||
return cleanDate.orElse(null);
|
||||
}).filter(Objects::nonNull).findAny();
|
||||
|
||||
if (cleanedDate.isPresent())
|
||||
return cleanedDate.get();
|
||||
|
||||
final Matcher matcher = incompleteDateRegex.matcher(inputDate);
|
||||
if (matcher.find()){
|
||||
final Integer year = Integer.parseInt(matcher.group(1));
|
||||
final Integer month = Integer.parseInt(matcher.group(4) == null ? "01":matcher.group(4));
|
||||
return String.format("%d-%02d-01",year, month);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public QName getName() {
|
||||
return new QName("http://eu/dnetlib/trasform/dates", "dateISO");
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType() {
|
||||
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] {
|
||||
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
|
||||
XdmValue r = xdmValues[0];
|
||||
if (r.size() == 0) {
|
||||
return new XdmAtomicValue("");
|
||||
}
|
||||
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
||||
return new XdmAtomicValue(clean(currentValue));
|
||||
}
|
||||
}
|
|
@ -41,6 +41,7 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
|
|||
try {
|
||||
Processor processor = new Processor(false);
|
||||
processor.registerExtensionFunction(cleanFunction);
|
||||
processor.registerExtensionFunction(new DateCleaner());
|
||||
final XsltCompiler comp = processor.newXsltCompiler();
|
||||
XsltExecutable xslt = comp
|
||||
.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes())));
|
||||
|
|
|
@ -8,9 +8,12 @@ import java.io.IOException;
|
|||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import eu.dnetlib.dhp.transformation.xslt.DateCleaner;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -39,10 +42,10 @@ public class TransformationJobTest extends AbstractVocabularyTest {
|
|||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException, ISLookUpException {
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(CollectionJobTest.class.getSimpleName());
|
||||
conf.setMaster("local");
|
||||
spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
// SparkConf conf = new SparkConf();
|
||||
// conf.setAppName(CollectionJobTest.class.getSimpleName());
|
||||
// conf.setMaster("local");
|
||||
// spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
|
@ -52,9 +55,22 @@ public class TransformationJobTest extends AbstractVocabularyTest {
|
|||
|
||||
@AfterAll
|
||||
public static void afterAll() {
|
||||
spark.stop();
|
||||
// spark.stop();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@DisplayName("Test Date cleaner")
|
||||
public void testDateCleaner() throws Exception {
|
||||
DateCleaner dc = new DateCleaner();
|
||||
assertEquals(dc.clean("20/09/1982"),"1982-09-20");
|
||||
assertEquals(dc.clean("20-09-2002"),"2002-09-20");
|
||||
assertEquals(dc.clean("2002-09-20"),"2002-09-20");
|
||||
assertEquals(dc.clean("2002-9"),"2002-09-01");
|
||||
assertEquals(dc.clean("2021"),"2021-01-01");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@DisplayName("Test Transform Single XML using XSLTTransformator")
|
||||
public void testTransformSaxonHE() throws Exception {
|
||||
|
@ -66,12 +82,8 @@ public class TransformationJobTest extends AbstractVocabularyTest {
|
|||
// We Load the XSLT transformation Rule from the classpath
|
||||
XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt");
|
||||
|
||||
|
||||
MetadataRecord result = tr.call(mr);
|
||||
|
||||
|
||||
|
||||
|
||||
// Print the record
|
||||
System.out.println(result.getBody());
|
||||
// TODO Create significant Assert
|
||||
|
|
|
@ -56,7 +56,7 @@
|
|||
<subject>ADCP</subject>
|
||||
</subjects>
|
||||
<dates>
|
||||
<date dateType="Issued">2019-05-29</date>
|
||||
<date dateType="Available">2019</date>
|
||||
</dates>
|
||||
<resourceType resourceTypeGeneral="Dataset"/>
|
||||
<relatedIdentifiers>
|
||||
|
|
|
@ -4,8 +4,9 @@
|
|||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:vocabulary="http://eu/dnetlib/trasform/extension"
|
||||
xmlns:dateCleaner="http://eu/dnetlib/trasform/dates"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
exclude-result-prefixes="xsl vocabulary">
|
||||
exclude-result-prefixes="xsl vocabulary dateCleaner">
|
||||
<xsl:param name="varOfficialName"/>
|
||||
<xsl:param name="varDsType"/>
|
||||
<xsl:param name="varDataSourceId"/>
|
||||
|
@ -53,7 +54,7 @@
|
|||
|
||||
<xsl:if test="//*[local-name()='date']/@dateType='Available'">
|
||||
<xsl:variable name='varEmbargoEndDate'
|
||||
select="vocabulary:clean(normalize-space(//*[local-name()='date'][@dateType='Available']), 'DateISO8601')"/>
|
||||
select="dateCleaner:dateISO(normalize-space(//*[local-name()='date'][@dateType='Available']))"/>
|
||||
<xsl:choose>
|
||||
<xsl:when test="string-length($varEmbargoEndDate) > 0">
|
||||
<oaf:embargoenddate>
|
||||
|
@ -112,7 +113,7 @@
|
|||
|
||||
<oaf:dateAccepted>
|
||||
<xsl:value-of
|
||||
select="normalize-space(//*[local-name()='publicationYear'])"/>
|
||||
select="dateCleaner:dateISO(normalize-space(//*[local-name()='publicationYear']))"/>
|
||||
</oaf:dateAccepted>
|
||||
<xsl:choose>
|
||||
|
||||
|
|
Loading…
Reference in New Issue