1
0
Fork 0

changed xslt behaviour on failure

This commit is contained in:
Sandro La Bruzzo 2021-02-12 17:27:08 +01:00
parent 6a37c7f175
commit 7edcc87ed4
4 changed files with 103 additions and 91 deletions

View File

@ -106,7 +106,8 @@ public class TransformSparkJobNode {
log.info("Transformation Error item " + ct.getErrorItems().count()); log.info("Transformation Error item " + ct.getErrorItems().count());
writeHdfsFile( writeHdfsFile(
spark.sparkContext().hadoopConfiguration(), "" + mdstore.count(), outputBasePath + MDSTORE_SIZE_PATH); spark.sparkContext().hadoopConfiguration(),
"" + spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count(), outputBasePath + MDSTORE_SIZE_PATH);
} }
} }

View File

@ -1,47 +1,61 @@
package eu.dnetlib.dhp.transformation.xslt; package eu.dnetlib.dhp.transformation.xslt;
import net.sf.saxon.s9api.*;
import scala.Serializable;
import java.time.LocalDate; import java.time.LocalDate;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
import java.util.*; import java.util.*;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.sf.saxon.s9api.*;
import scala.Serializable;
public class DateCleaner implements ExtensionFunction, Serializable { public class DateCleaner implements ExtensionFunction, Serializable {
private final static List<Pattern> dateRegex = Arrays.asList( private final static List<Pattern> dateRegex = Arrays
.asList(
// Y-M-D // Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE), Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
// M-D-Y // M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE), Pattern
.compile(
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
Pattern.MULTILINE),
// D-M-Y // D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE), Pattern
.compile(
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
Pattern.MULTILINE),
// Y // Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE) Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE));
);
private final static Pattern incompleteDateRegex = Pattern.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE); private final static Pattern incompleteDateRegex = Pattern
.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE);
private final static List<DateTimeFormatter> dformats = Arrays.asList( private final static List<DateTimeFormatter> dformats = Arrays
DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH), .asList(
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) DateTimeFormatter
); .ofPattern(
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
Locale.ENGLISH),
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN));
public String clean(final String inputDate) { public String clean(final String inputDate) {
Optional<String> cleanedDate = dateRegex.stream().map( Optional<String> cleanedDate = dateRegex
.stream()
.map(
p -> { p -> {
final Matcher matcher = p.matcher(inputDate); final Matcher matcher = p.matcher(inputDate);
if (matcher.find()) if (matcher.find())
return matcher.group(0); return matcher.group(0);
else else
return null; return null;
} })
).filter(Objects::nonNull) .filter(Objects::nonNull)
.map(m -> { .map(m -> {
Optional<String> cleanDate = dformats.stream() Optional<String> cleanDate = dformats
.stream()
.map(f -> { .map(f -> {
try { try {
LocalDate parsedDate = LocalDate.parse(m, f); LocalDate parsedDate = LocalDate.parse(m, f);
@ -54,10 +68,14 @@ public class DateCleaner implements ExtensionFunction, Serializable {
} }
} }
).filter(Objects::nonNull).findAny(); )
.filter(Objects::nonNull)
.findAny();
return cleanDate.orElse(null); return cleanDate.orElse(null);
}).filter(Objects::nonNull).findAny(); })
.filter(Objects::nonNull)
.findAny();
if (cleanedDate.isPresent()) if (cleanedDate.isPresent())
return cleanedDate.get(); return cleanedDate.get();

View File

@ -63,7 +63,7 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
return value; return value;
} catch (Throwable e) { } catch (Throwable e) {
aggregationCounter.getErrorItems().add(1); aggregationCounter.getErrorItems().add(1);
return null; throw new RuntimeException(e);
} }
} }
} }

View File

@ -11,7 +11,6 @@ import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import eu.dnetlib.dhp.transformation.xslt.DateCleaner;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -28,8 +27,8 @@ import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest;
import eu.dnetlib.dhp.aggregation.common.AggregationCounter; import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.xslt.DateCleaner;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@ -56,7 +55,6 @@ public class TransformationJobTest extends AbstractVocabularyTest {
spark.stop(); spark.stop();
} }
@Test @Test
@DisplayName("Test Date cleaner") @DisplayName("Test Date cleaner")
public void testDateCleaner() throws Exception { public void testDateCleaner() throws Exception {
@ -68,7 +66,6 @@ public class TransformationJobTest extends AbstractVocabularyTest {
assertEquals(dc.clean("2021"), "2021-01-01"); assertEquals(dc.clean("2021"), "2021-01-01");
} }
@Test @Test
@DisplayName("Test Transform Single XML using XSLTTransformator") @DisplayName("Test Transform Single XML using XSLTTransformator")
public void testTransformSaxonHE() throws Exception { public void testTransformSaxonHE() throws Exception {
@ -80,12 +77,8 @@ public class TransformationJobTest extends AbstractVocabularyTest {
// We Load the XSLT transformation Rule from the classpath // We Load the XSLT transformation Rule from the classpath
XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt"); XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt");
MetadataRecord result = tr.call(mr); MetadataRecord result = tr.call(mr);
// Print the record // Print the record
System.out.println(result.getBody()); System.out.println(result.getBody());
// TODO Create significant Assert // TODO Create significant Assert