From f348e5c1a0b4918720b1a7ed48e22ec3b01e4b01 Mon Sep 17 00:00:00 2001
From: "michele.artini"
Date: Thu, 20 Apr 2023 10:07:22 +0200
Subject: [PATCH] xsl functions

---
 libs/dnet-data-services/pom.xml               |  12 +-
 .../data/mapping/xslt/DnetXsltFunction.java   |  15 ++
 .../data/mapping/xslt/XsltDateCleaner.java    |  88 ++++++++++
 .../data/mapping/xslt/XsltPersonCleaner.java  | 164 ++++++++++++++++++
 .../mapping/xslt/XsltTransformFactory.java    |  22 ++-
 .../mapping/xslt/XsltVocabularyCleaner.java   |  90 ++++++++++
 6 files changed, 376 insertions(+), 10 deletions(-)
 create mode 100644 libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/DnetXsltFunction.java
 create mode 100644 libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltDateCleaner.java
 create mode 100644 libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltPersonCleaner.java
 create mode 100644 libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltVocabularyCleaner.java

diff --git a/libs/dnet-data-services/pom.xml b/libs/dnet-data-services/pom.xml
index e64b8ad3..5db07a4a 100644
--- a/libs/dnet-data-services/pom.xml
+++ b/libs/dnet-data-services/pom.xml
@@ -23,15 +23,15 @@
 			<artifactId>commons-codec</artifactId>
 		</dependency>
 
+		<dependency>
+			<groupId>com.github.sisyphsu</groupId>
+			<artifactId>dateparser</artifactId>
+			<version>1.0.11</version>
+		</dependency>
+
 		<dependency>
 			<groupId>net.sf.saxon</groupId>
 			<artifactId>Saxon-HE</artifactId>
-			<exclusions>
-				<exclusion>
-					<groupId>xml-apis</groupId>
-					<artifactId>xml-apis</artifactId>
-				</exclusion>
-			</exclusions>
 		</dependency>
 
 		<dependency>
diff --git a/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/DnetXsltFunction.java b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/DnetXsltFunction.java
new file mode 100644
index 00000000..201cea8e
--- /dev/null
+++ b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/DnetXsltFunction.java
@@ -0,0 +1,15 @@
+package eu.dnetlib.data.mapping.xslt;
+
+import net.sf.saxon.s9api.ExtensionFunction;
+
+/**
+ * A custom XSLT function, exposed as a Saxon {@link ExtensionFunction} so that
+ * {@link XsltTransformFactory} can register it on the processor it creates.
+ */
+public interface DnetXsltFunction {
+
+	/**
+	 * @return the Saxon extension function implementing this XSLT function
+	 */
+	ExtensionFunction asExtensionFunction();
+}
diff --git a/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltDateCleaner.java b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltDateCleaner.java
new file mode 100644
index 00000000..0a8acc14
--- /dev/null
+++ b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltDateCleaner.java
@@ -0,0 +1,88 @@
+package eu.dnetlib.data.mapping.xslt;
+
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+
+import org.apache.commons.lang3.StringUtils;
+import org.springframework.stereotype.Component;
+
+import com.github.sisyphsu.dateparser.DateParserUtils;
+
+import net.sf.saxon.s9api.ExtensionFunction;
+import net.sf.saxon.s9api.ItemType;
+import net.sf.saxon.s9api.OccurrenceIndicator;
+import net.sf.saxon.s9api.QName;
+import net.sf.saxon.s9api.SaxonApiException;
+import net.sf.saxon.s9api.SequenceType;
+import net.sf.saxon.s9api.XdmAtomicValue;
+import net.sf.saxon.s9api.XdmValue;
+
+/**
+ * XSLT extension function {@code dateISO}: rewrites a date in the {@value #DATE_FORMAT} format.
+ */
+@Component
+public class XsltDateCleaner implements DnetXsltFunction {
+
+	public static final String DATE_FORMAT = "yyyy-MM-dd";
+
+	// DateTimeFormatter is immutable and thread-safe: build it once instead of at every call
+	private static final DateTimeFormatter FORMATTER = DateTimeFormatter.ofPattern(DATE_FORMAT);
+
+	/**
+	 * Parses a date with {@link DateParserUtils} and formats it as {@value #DATE_FORMAT}.
+	 *
+	 * @param inputDate
+	 *            the raw date, may be null or blank
+	 * @return the formatted date, or null when the input is blank or cannot be parsed
+	 */
+	public static String clean(final String inputDate) {
+		if (StringUtils.isBlank(inputDate)) { return null; }
+
+		try {
+			final LocalDate date = DateParserUtils
+				.parseDate(inputDate.trim())
+				.toInstant()
+				.atZone(ZoneId.systemDefault())
+				.toLocalDate();
+			return FORMATTER.format(date);
+		} catch (final DateTimeParseException ignored) {
+			// not a recognizable date: report it as "no value"
+			return null;
+		}
+	}
+
+	@Override
+	public ExtensionFunction asExtensionFunction() {
+		return new ExtensionFunction() {
+
+			@Override
+			public QName getName() {
+				return new QName(XsltTransformFactory.QNAME_BASE_URI + "/dateISO", "dateISO");
+			}
+
+			@Override
+			public SequenceType getResultType() {
+				return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
+			}
+
+			@Override
+			public SequenceType[] getArgumentTypes() {
+				return new SequenceType[] {
+					SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
+				};
+			}
+
+			@Override
+			public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
+				final XdmValue r = xdmValues[0];
+				if (r.size() == 0) { return new XdmAtomicValue(""); }
+				final String cleaned = clean(r.itemAt(0).getStringValue());
+				// an unparsable date yields the empty string, never a null atomic value
+				return new XdmAtomicValue(cleaned != null ? cleaned : "");
+			}
+
+		};
+	}
+}
diff --git a/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltPersonCleaner.java b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltPersonCleaner.java
new file mode 100644
index 00000000..853909df
--- /dev/null
+++ b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltPersonCleaner.java
@@ -0,0 +1,164 @@
+package eu.dnetlib.data.mapping.xslt;
+
+import java.text.Normalizer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+import org.springframework.stereotype.Component;
+
+import net.sf.saxon.s9api.ExtensionFunction;
+import net.sf.saxon.s9api.ItemType;
+import net.sf.saxon.s9api.OccurrenceIndicator;
+import net.sf.saxon.s9api.QName;
+import net.sf.saxon.s9api.SaxonApiException;
+import net.sf.saxon.s9api.SequenceType;
+import net.sf.saxon.s9api.XdmAtomicValue;
+import net.sf.saxon.s9api.XdmValue;
+
+/**
+ * XSLT extension function {@code normalize}: cleans a person name and, when surname and first
+ * name can be told apart, rewrites it as {@code Surname, Firstname}.
+ */
+@Component
+public class XsltPersonCleaner implements DnetXsltFunction {
+
+	/**
+	 * Normalizes a person name. Bracketed text, punctuation other than {@code -} and {@code ,}
+	 * and digits are removed and whitespace is collapsed; surname and first name are then
+	 * guessed from a comma, from the position of the initials or from upper-case terms.
+	 *
+	 * @param s
+	 *            the raw name, may be null or blank
+	 * @return {@code Surname, Firstname} when both parts were identified, otherwise the cleaned
+	 *         terms joined by single spaces (the empty string when nothing is left)
+	 */
+	public static String normalize(String s) {
+		if (StringUtils.isBlank(s)) { return ""; }
+
+		final List<String> firstname = new ArrayList<>();
+		final List<String> surname = new ArrayList<>();
+		final List<String> fullname = new ArrayList<>();
+
+		s = Normalizer.normalize(s, Normalizer.Form.NFD);
+		s = s.replaceAll("\\(.+\\)", "");
+		s = s.replaceAll("\\[.+\\]", "");
+		s = s.replaceAll("\\{.+\\}", "");
+		s = s.replaceAll("\\s+-\\s+", "-");
+		s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
+		// replaceAll, not replace: "\\d" has to be read as a regex. Dots are already removed by
+		// \p{Punct} above and newlines by \s+ below, so they need no dedicated step.
+		s = s.replaceAll("\\d", " ");
+		s = s.replaceAll("\\s+", " ");
+
+		if (s.contains(",")) {
+			final String[] arr = s.split(",");
+			if (arr.length == 1) {
+				fullname.addAll(splitTerms(arr[0]));
+			} else if (arr.length > 1) {
+				surname.addAll(splitTerms(arr[0]));
+				firstname.addAll(splitTermsFirstName(arr[1]));
+				fullname.addAll(surname);
+				fullname.addAll(firstname);
+			}
+		} else {
+			fullname.addAll(splitTerms(s));
+
+			int lastInitialPosition = fullname.size();
+			boolean hasSurnameInUpperCase = false;
+
+			for (int i = 0; i < fullname.size(); i++) {
+				final String term = fullname.get(i);
+				if (term.length() == 1) {
+					lastInitialPosition = i;
+				} else if (isUpperCase(term)) {
+					hasSurnameInUpperCase = true;
+				}
+			}
+			if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
+				firstname.addAll(fullname.subList(0, lastInitialPosition + 1));
+				surname.addAll(fullname.subList(lastInitialPosition + 1, fullname.size()));
+			} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
+				for (final String term : fullname) {
+					if (term.length() > 1 && isUpperCase(term)) {
+						surname.add(term);
+					} else {
+						firstname.add(term);
+					}
+				}
+			} else if (!fullname.isEmpty() && lastInitialPosition == fullname.size()) { // Case: Michele Artini
+				surname.addAll(fullname.subList(lastInitialPosition - 1, fullname.size()));
+				firstname.addAll(fullname.subList(0, lastInitialPosition - 1));
+			}
+		}
+
+		// NOTE(review): this method used to return null and drop the terms computed above.
+		// The "Surname, Firstname" layout is an assumption to confirm against the transformation rules.
+		if (surname.isEmpty() || firstname.isEmpty()) { return String.join(" ", fullname); }
+		return String.join(" ", surname) + ", " + String.join(" ", firstname);
+	}
+
+	// True when the term has no lower-case letter; Locale.ROOT keeps it independent of the JVM locale
+	private static boolean isUpperCase(final String term) {
+		return term.equals(term.toUpperCase(Locale.ROOT));
+	}
+
+	// Splits the first-name part in terms, expanding 2-3 joined capitals into initials ("GB" -> "G", "B")
+	private static List<String> splitTermsFirstName(final String s) {
+		final List<String> list = new ArrayList<>();
+		for (final String part : splitTerms(s)) {
+			// test the single term, not the whole string: "Maria GB" must still expand "GB"
+			if (part.matches("\\p{Lu}{2,3}")) {
+				for (final String p : part.split("(?=\\p{Lu})")) { // (Unicode UpperCase)
+					if (!p.isEmpty()) {
+						list.add(p);
+					}
+				}
+			} else {
+				list.add(part);
+			}
+		}
+		return list;
+	}
+
+	// Splits on single spaces and drops the blank terms
+	private static List<String> splitTerms(final String s) {
+		return Arrays.stream(s.split(" "))
+			.filter(StringUtils::isNotBlank)
+			.collect(Collectors.toList());
+	}
+
+	@Override
+	public ExtensionFunction asExtensionFunction() {
+		return new ExtensionFunction() {
+
+			@Override
+			public QName getName() {
+				return new QName(XsltTransformFactory.QNAME_BASE_URI + "/person", "normalize");
+			}
+
+			@Override
+			public SequenceType getResultType() {
+				return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
+			}
+
+			@Override
+			public SequenceType[] getArgumentTypes() {
+				return new SequenceType[] {
+					SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
+				};
+			}
+
+			@Override
+			public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
+				final XdmValue r = xdmValues[0];
+				if (r.size() == 0) { return new XdmAtomicValue(""); }
+				return new XdmAtomicValue(normalize(r.itemAt(0).getStringValue()));
+			}
+
+		};
+	}
+}
diff --git a/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltTransformFactory.java b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltTransformFactory.java
index 2da38043..cd753841 100644
--- a/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltTransformFactory.java
+++ b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltTransformFactory.java
@@ -2,16 +2,21 @@ package eu.dnetlib.data.mapping.xslt;
 
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
+import java.util.List;
 import java.util.Map;
 
 import javax.xml.transform.stream.StreamSource;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
 
 import eu.dnetlib.data.mapping.RecordTransformer;
 import eu.dnetlib.errors.TransformationException;
 import eu.dnetlib.is.resource.SimpleResourceService;
+import net.sf.saxon.s9api.ExtensionFunction;
 import net.sf.saxon.s9api.Processor;
 import net.sf.saxon.s9api.QName;
 import net.sf.saxon.s9api.SaxonApiException;
@@ -22,20 +27,29 @@ import net.sf.saxon.s9api.XsltCompiler;
 import net.sf.saxon.s9api.XsltExecutable;
 import net.sf.saxon.s9api.XsltTransformer;
 
+@Service
 public class XsltTransformFactory {
 
+	public static final String QNAME_BASE_URI = "http://eu/dnetlib/transform";
+
+	private static final Log log = LogFactory.getLog(XsltTransformFactory.class);
+
 	@Autowired
 	private SimpleResourceService simpleResourceService;
 
+	@Autowired
+	private List<DnetXsltFunction> xsltFunctions;
+
 	public RecordTransformer getTransformer(final String ruleId, final Map initialParams) throws TransformationException {
 		try {
 			final String xsltText = simpleResourceService.getContent(ruleId);
 
 			final Processor processor = new Processor(false);
-			// TODO consider the following functions
-			// processor.registerExtensionFunction(cleanFunction);
-			// processor.registerExtensionFunction(new DateCleaner());
-			// processor.registerExtensionFunction(new PersonCleaner());
+			xsltFunctions.forEach(f -> {
+				final ExtensionFunction extFunction = f.asExtensionFunction();
+				processor.registerExtensionFunction(extFunction);
+				log.info("New XSLT function registered: " + extFunction.getName());
+			});
 
 			final XsltCompiler comp = processor.newXsltCompiler();
 			initialParams.forEach((k, v) -> {
diff --git a/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltVocabularyCleaner.java b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltVocabularyCleaner.java
new file mode 100644
index 00000000..eefd3aa1
--- /dev/null
+++ b/libs/dnet-data-services/src/main/java/eu/dnetlib/data/mapping/xslt/XsltVocabularyCleaner.java
@@ -0,0 +1,90 @@
+package eu.dnetlib.data.mapping.xslt;
+
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import eu.dnetlib.is.model.vocabulary.Synonym;
+import eu.dnetlib.is.vocabulary.VocabularyService;
+import net.sf.saxon.s9api.ExtensionFunction;
+import net.sf.saxon.s9api.ItemType;
+import net.sf.saxon.s9api.OccurrenceIndicator;
+import net.sf.saxon.s9api.QName;
+import net.sf.saxon.s9api.SaxonApiException;
+import net.sf.saxon.s9api.SequenceType;
+import net.sf.saxon.s9api.XdmAtomicValue;
+import net.sf.saxon.s9api.XdmValue;
+
+/**
+ * XSLT extension function {@code clean}: replaces a value with the code of the vocabulary term
+ * that lists it among its synonyms.
+ */
+@Component
+public class XsltVocabularyCleaner implements DnetXsltFunction {
+
+	@Autowired
+	private VocabularyService vocabularyService;
+
+	private static final String SEPARATOR = "@#@";
+
+	/**
+	 * Builds the function on a snapshot of the synonyms read from the vocabulary service when this
+	 * method is called.
+	 */
+	@Override
+	public ExtensionFunction asExtensionFunction() {
+
+		final Map<String, String> termsMap = new HashMap<>();
+
+		vocabularyService.listVocs().forEach(voc -> {
+			vocabularyService.listTerms(voc.getId()).forEach(term -> {
+				for (final Synonym s : term.getSynonyms()) {
+					// Locale.ROOT: keys must not depend on the default locale of the JVM
+					final String k = (voc.getId() + SEPARATOR + s.getTerm()).toLowerCase(Locale.ROOT);
+					final String v = term.getCode();
+					termsMap.put(k, v);
+				}
+			});
+		});
+
+		return new ExtensionFunction() {
+
+			@Override
+			public QName getName() {
+				return new QName(XsltTransformFactory.QNAME_BASE_URI + "/clean", "clean");
+			}
+
+			@Override
+			public SequenceType getResultType() {
+				return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE_OR_MORE);
+			}
+
+			@Override
+			public SequenceType[] getArgumentTypes() {
+				return new SequenceType[] {
+					SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_MORE),
+					SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE)
+				};
+			}
+
+			@Override
+			public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
+				final XdmValue r = xdmValues[0];
+				if (r.size() == 0) { return new XdmAtomicValue(""); }
+				// only the first item of the input sequence is looked up
+				final String currentValue = r.itemAt(0).getStringValue();
+				final String vocId = xdmValues[1].itemAt(0).getStringValue();
+
+				final String key = (vocId + SEPARATOR + currentValue).toLowerCase(Locale.ROOT);
+				final String cleanedValue = termsMap.getOrDefault(key, currentValue);
+
+				return new XdmAtomicValue(StringUtils.isNotBlank(cleanedValue) ? cleanedValue : currentValue);
+			}
+		};
+	}
+
+}