forked from antonis.lempesis/dnet-hadoop
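Changed XSLT behaviour on failure: a failed transformation now throws a RuntimeException (wrapping the original error) instead of returning null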
This commit is contained in:
parent
6a37c7f175
commit
7edcc87ed4
|
@ -106,7 +106,8 @@ public class TransformSparkJobNode {
|
||||||
log.info("Transformation Error item " + ct.getErrorItems().count());
|
log.info("Transformation Error item " + ct.getErrorItems().count());
|
||||||
|
|
||||||
writeHdfsFile(
|
writeHdfsFile(
|
||||||
spark.sparkContext().hadoopConfiguration(), "" + mdstore.count(), outputBasePath + MDSTORE_SIZE_PATH);
|
spark.sparkContext().hadoopConfiguration(),
|
||||||
|
"" + spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count(), outputBasePath + MDSTORE_SIZE_PATH);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,100 +1,118 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.transformation.xslt;
|
package eu.dnetlib.dhp.transformation.xslt;
|
||||||
|
|
||||||
import net.sf.saxon.s9api.*;
|
|
||||||
import scala.Serializable;
|
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import net.sf.saxon.s9api.*;
|
||||||
|
import scala.Serializable;
|
||||||
|
|
||||||
public class DateCleaner implements ExtensionFunction, Serializable {
|
public class DateCleaner implements ExtensionFunction, Serializable {
|
||||||
|
|
||||||
private final static List<Pattern> dateRegex = Arrays.asList(
|
private final static List<Pattern> dateRegex = Arrays
|
||||||
//Y-M-D
|
.asList(
|
||||||
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
// Y-M-D
|
||||||
//M-D-Y
|
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
||||||
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
|
// M-D-Y
|
||||||
//D-M-Y
|
Pattern
|
||||||
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
|
.compile(
|
||||||
//Y
|
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
|
||||||
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
Pattern.MULTILINE),
|
||||||
);
|
// D-M-Y
|
||||||
|
Pattern
|
||||||
|
.compile(
|
||||||
|
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
|
||||||
|
Pattern.MULTILINE),
|
||||||
|
// Y
|
||||||
|
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE));
|
||||||
|
|
||||||
private final static Pattern incompleteDateRegex = Pattern.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE);
|
private final static Pattern incompleteDateRegex = Pattern
|
||||||
|
.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE);
|
||||||
|
|
||||||
private final static List<DateTimeFormatter> dformats = Arrays.asList(
|
private final static List<DateTimeFormatter> dformats = Arrays
|
||||||
DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH),
|
.asList(
|
||||||
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
DateTimeFormatter
|
||||||
);
|
.ofPattern(
|
||||||
|
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
|
||||||
|
Locale.ENGLISH),
|
||||||
|
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN));
|
||||||
|
|
||||||
public String clean(final String inputDate) {
|
public String clean(final String inputDate) {
|
||||||
|
|
||||||
Optional<String> cleanedDate = dateRegex.stream().map(
|
Optional<String> cleanedDate = dateRegex
|
||||||
p -> {
|
.stream()
|
||||||
final Matcher matcher = p.matcher(inputDate);
|
.map(
|
||||||
if (matcher.find())
|
p -> {
|
||||||
return matcher.group(0);
|
final Matcher matcher = p.matcher(inputDate);
|
||||||
else
|
if (matcher.find())
|
||||||
return null;
|
return matcher.group(0);
|
||||||
}
|
else
|
||||||
).filter(Objects::nonNull)
|
return null;
|
||||||
.map(m -> {
|
})
|
||||||
Optional<String> cleanDate = dformats.stream()
|
.filter(Objects::nonNull)
|
||||||
.map(f -> {
|
.map(m -> {
|
||||||
try {
|
Optional<String> cleanDate = dformats
|
||||||
LocalDate parsedDate = LocalDate.parse(m, f);
|
.stream()
|
||||||
if (parsedDate != null)
|
.map(f -> {
|
||||||
return parsedDate.toString();
|
try {
|
||||||
else
|
LocalDate parsedDate = LocalDate.parse(m, f);
|
||||||
return null;
|
if (parsedDate != null)
|
||||||
} catch (Throwable e) {
|
return parsedDate.toString();
|
||||||
return null;
|
else
|
||||||
}
|
return null;
|
||||||
}
|
} catch (Throwable e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
).filter(Objects::nonNull).findAny();
|
)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.findAny();
|
||||||
|
|
||||||
return cleanDate.orElse(null);
|
return cleanDate.orElse(null);
|
||||||
}).filter(Objects::nonNull).findAny();
|
})
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.findAny();
|
||||||
|
|
||||||
if (cleanedDate.isPresent())
|
if (cleanedDate.isPresent())
|
||||||
return cleanedDate.get();
|
return cleanedDate.get();
|
||||||
|
|
||||||
final Matcher matcher = incompleteDateRegex.matcher(inputDate);
|
final Matcher matcher = incompleteDateRegex.matcher(inputDate);
|
||||||
if (matcher.find()){
|
if (matcher.find()) {
|
||||||
final Integer year = Integer.parseInt(matcher.group(1));
|
final Integer year = Integer.parseInt(matcher.group(1));
|
||||||
final Integer month = Integer.parseInt(matcher.group(4) == null ? "01":matcher.group(4));
|
final Integer month = Integer.parseInt(matcher.group(4) == null ? "01" : matcher.group(4));
|
||||||
return String.format("%d-%02d-01",year, month);
|
return String.format("%d-%02d-01", year, month);
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public QName getName() {
|
public QName getName() {
|
||||||
return new QName("http://eu/dnetlib/trasform/dates", "dateISO");
|
return new QName("http://eu/dnetlib/trasform/dates", "dateISO");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SequenceType getResultType() {
|
public SequenceType getResultType() {
|
||||||
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
|
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SequenceType[] getArgumentTypes() {
|
public SequenceType[] getArgumentTypes() {
|
||||||
return new SequenceType[] {
|
return new SequenceType[] {
|
||||||
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
|
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
|
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
|
||||||
XdmValue r = xdmValues[0];
|
XdmValue r = xdmValues[0];
|
||||||
if (r.size() == 0) {
|
if (r.size() == 0) {
|
||||||
return new XdmAtomicValue("");
|
return new XdmAtomicValue("");
|
||||||
}
|
}
|
||||||
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
||||||
return new XdmAtomicValue(clean(currentValue));
|
return new XdmAtomicValue(clean(currentValue));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -63,7 +63,7 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
|
||||||
return value;
|
return value;
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
aggregationCounter.getErrorItems().add(1);
|
aggregationCounter.getErrorItems().add(1);
|
||||||
return null;
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,6 @@ import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.transformation.xslt.DateCleaner;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -28,8 +27,8 @@ import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest;
|
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest;
|
||||||
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
|
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||||
|
import eu.dnetlib.dhp.transformation.xslt.DateCleaner;
|
||||||
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
|
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
|
||||||
|
@ -56,19 +55,17 @@ public class TransformationJobTest extends AbstractVocabularyTest {
|
||||||
spark.stop();
|
spark.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@DisplayName("Test Date cleaner")
|
@DisplayName("Test Date cleaner")
|
||||||
public void testDateCleaner() throws Exception {
|
public void testDateCleaner() throws Exception {
|
||||||
DateCleaner dc = new DateCleaner();
|
DateCleaner dc = new DateCleaner();
|
||||||
assertEquals(dc.clean("20/09/1982"),"1982-09-20");
|
assertEquals(dc.clean("20/09/1982"), "1982-09-20");
|
||||||
assertEquals(dc.clean("20-09-2002"),"2002-09-20");
|
assertEquals(dc.clean("20-09-2002"), "2002-09-20");
|
||||||
assertEquals(dc.clean("2002-09-20"),"2002-09-20");
|
assertEquals(dc.clean("2002-09-20"), "2002-09-20");
|
||||||
assertEquals(dc.clean("2002-9"),"2002-09-01");
|
assertEquals(dc.clean("2002-9"), "2002-09-01");
|
||||||
assertEquals(dc.clean("2021"),"2021-01-01");
|
assertEquals(dc.clean("2021"), "2021-01-01");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@DisplayName("Test Transform Single XML using XSLTTransformator")
|
@DisplayName("Test Transform Single XML using XSLTTransformator")
|
||||||
public void testTransformSaxonHE() throws Exception {
|
public void testTransformSaxonHE() throws Exception {
|
||||||
|
@ -80,12 +77,8 @@ public class TransformationJobTest extends AbstractVocabularyTest {
|
||||||
// We Load the XSLT transformation Rule from the classpath
|
// We Load the XSLT transformation Rule from the classpath
|
||||||
XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt");
|
XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt");
|
||||||
|
|
||||||
|
|
||||||
MetadataRecord result = tr.call(mr);
|
MetadataRecord result = tr.call(mr);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Print the record
|
// Print the record
|
||||||
System.out.println(result.getBody());
|
System.out.println(result.getBody());
|
||||||
// TODO Create significant Assert
|
// TODO Create significant Assert
|
||||||
|
|
Loading…
Reference in New Issue