From 956da2f923bf622b66c6c9695e21367656f303a9 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 13 Feb 2020 16:49:45 +0100 Subject: [PATCH] added Saxon-HE extension functions and Transformer factory class --- dhp-common/pom.xml | 4 ++ .../saxon/AbstractExtensionFunction.java | 32 +++++++++ .../dnetlib/dhp/utils/saxon/ExtractYear.java | 67 +++++++++++++++++++ .../dhp/utils/saxon/NormalizeDate.java | 66 ++++++++++++++++++ .../eu/dnetlib/dhp/utils/saxon/PickFirst.java | 53 +++++++++++++++ .../utils/saxon/SaxonTransformerFactory.java | 30 +++++++++ 6 files changed, 252 insertions(+) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 43c2a3834..ae7302b98 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -42,6 +42,10 @@ com.rabbitmq amqp-client + + net.sf.saxon + Saxon-HE + diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java new file mode 100644 index 000000000..bd3962440 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java @@ -0,0 +1,32 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.lib.ExtensionFunctionCall; +import net.sf.saxon.lib.ExtensionFunctionDefinition; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.om.StructuredQName; +import net.sf.saxon.trans.XPathException; + +public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition { + + public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension"; + + public abstract String getName(); + public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException; + + @Override + public StructuredQName getFunctionQName() { + return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName()); + } + + @Override + public ExtensionFunctionCall makeCallExpression() { + return new ExtensionFunctionCall() { + @Override + public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException { + return doCall(context, arguments); + } + }; + } + +} \ No newline at end of file diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java new file mode 100644 index 000000000..f90e2a23e --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java @@ -0,0 +1,67 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Item; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.trans.XPathException; +import net.sf.saxon.value.SequenceType; +import net.sf.saxon.value.StringValue; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.GregorianCalendar; + +public class ExtractYear extends AbstractExtensionFunction { + + private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" }; + + @Override + public String getName() { + return "extractYear"; + } + + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } + final Item item = arguments[0].head(); + if (item == null) { + return new StringValue(""); + } + return new StringValue(_year(item.getStringValue())); + } + + @Override + public int getMinimumNumberOfArguments() { + return 0; + } + + @Override + public int getMaximumNumberOfArguments() { + return 1; + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + } + + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } + + private String _year(String s) { + Calendar c = new GregorianCalendar(); + for (String format : dateFormats) { + try { + c.setTime(new SimpleDateFormat(format).parse(s)); + String year = String.valueOf(c.get(Calendar.YEAR)); + return year; + } catch (ParseException e) {} + } + return ""; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java new file mode 100644 index 000000000..634e08788 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java @@ -0,0 +1,66 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.trans.XPathException; +import net.sf.saxon.value.SequenceType; +import net.sf.saxon.value.StringValue; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; + +public class NormalizeDate extends AbstractExtensionFunction { + + private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" }; + + private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); + + @Override + public String getName() { + return "normalizeDate"; + } + + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } + String s = arguments[0].head().getStringValue(); + return new StringValue(_year(s)); + } + + @Override + public int getMinimumNumberOfArguments() { + return 0; + } + + @Override + public int getMaximumNumberOfArguments() { + return 1; + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + } + + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } + + private String _year(String s) { + final String date = s != null ? s.trim() : ""; + + for (String format : normalizeDateFormats) { + try { + Date parse = new SimpleDateFormat(format).parse(date); + String res = new SimpleDateFormat(normalizeOutFormat).format(parse); + return res; + } catch (ParseException e) {} + } + return ""; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java new file mode 100644 index 000000000..1f209bed0 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java @@ -0,0 +1,53 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.trans.XPathException; +import net.sf.saxon.value.SequenceType; +import net.sf.saxon.value.StringValue; +import org.apache.commons.lang3.StringUtils; + +public class PickFirst extends AbstractExtensionFunction { + + @Override + public String getName() { + return "pickFirst"; + } + + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } + String s1 = arguments[0].head().getStringValue(); + + if (arguments.length > 1) { + String s2 = arguments[1].head().getStringValue(); + + return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); + } else { + return new StringValue(StringUtils.isNotBlank(s1) ? s1 : ""); + } + } + + @Override + public int getMinimumNumberOfArguments() { + return 0; + } + + @Override + public int getMaximumNumberOfArguments() { + return 2; + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + } + + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java new file mode 100644 index 000000000..611709ff0 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java @@ -0,0 +1,30 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.Configuration; +import net.sf.saxon.TransformerFactoryImpl; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.stream.StreamSource; +import java.io.StringReader; + +public class SaxonTransformerFactory { + + /** + * Creates the index record transformer from the given XSLT + * @param xslt + * @return + * @throws TransformerException + */ + public static Transformer newInstance(final String xslt) throws TransformerException { + + final TransformerFactoryImpl factory = new TransformerFactoryImpl(); + final Configuration conf = factory.getConfiguration(); + conf.registerExtensionFunction(new ExtractYear()); + conf.registerExtensionFunction(new NormalizeDate()); + conf.registerExtensionFunction(new PickFirst()); + + return factory.newTransformer(new StreamSource(new StringReader(xslt))); + } + +}