xsl functions

This commit is contained in:
Michele Artini 2023-04-20 10:07:22 +02:00
parent 0beb8ea671
commit f348e5c1a0
6 changed files with 327 additions and 10 deletions

View File

@ -23,15 +23,15 @@
<artifactId>commons-codec</artifactId>
</dependency>
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>
<version>1.0.11</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<exclusions>
<exclusion>
<groupId>xml-apis</groupId>
<artifactId>xml-apis</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Hadoop -->

View File

@ -0,0 +1,8 @@
package eu.dnetlib.data.mapping.xslt;
import net.sf.saxon.s9api.ExtensionFunction;
/**
 * Contract for components that expose a custom function to XSLT stylesheets.
 *
 * <p>
 * Implementations wrap their logic in a Saxon {@link ExtensionFunction}; {@code XsltTransformFactory} collects all
 * implementations and registers each returned function on the Saxon processor before compiling a stylesheet.
 */
@FunctionalInterface
public interface DnetXsltFunction {

	/**
	 * Builds the Saxon extension function provided by this component.
	 *
	 * @return the extension function to register on the XSLT processor
	 */
	ExtensionFunction asExtensionFunction();
}

View File

@ -0,0 +1,73 @@
package eu.dnetlib.data.mapping.xslt;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import com.github.sisyphsu.dateparser.DateParserUtils;
import net.sf.saxon.s9api.ExtensionFunction;
import net.sf.saxon.s9api.ItemType;
import net.sf.saxon.s9api.OccurrenceIndicator;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.SequenceType;
import net.sf.saxon.s9api.XdmAtomicValue;
import net.sf.saxon.s9api.XdmValue;
/**
 * XSLT extension function that rewrites a free-form date string as {@value #DATE_FORMAT}.
 *
 * <p>
 * The function is published with local name {@code dateISO} in the namespace
 * {@code XsltTransformFactory.QNAME_BASE_URI + "/dateISO"}. It takes an optional string and returns a string: the
 * formatted date, or the empty string when the input is missing, blank or cannot be parsed.
 */
@Component
public class XsltDateCleaner implements DnetXsltFunction {

	public static final String DATE_FORMAT = "yyyy-MM-dd";

	/** Shared formatter: {@link DateTimeFormatter} is immutable and thread-safe, so it is built only once. */
	private static final DateTimeFormatter OUTPUT_FORMATTER = DateTimeFormatter.ofPattern(DATE_FORMAT);

	@Override
	public ExtensionFunction asExtensionFunction() {
		return new ExtensionFunction() {

			@Override
			public QName getName() {
				return new QName(XsltTransformFactory.QNAME_BASE_URI + "/dateISO", "dateISO");
			}

			@Override
			public SequenceType getResultType() {
				return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
			}

			@Override
			public SequenceType[] getArgumentTypes() {
				return new SequenceType[] {
					SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
				};
			}

			/**
			 * Cleans the first item of the first argument.
			 *
			 * @param xdmValues
			 *            the call arguments; only the first item of the first argument is read
			 * @return the date formatted as {@value #DATE_FORMAT}, or an empty string when there is no usable date
			 */
			@Override
			public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
				final XdmValue r = xdmValues[0];
				if (r.size() == 0) { return new XdmAtomicValue(""); }

				final String cleaned = clean(r.itemAt(0).getStringValue());

				// clean() signals "no date" with null: never hand a null to XdmAtomicValue, answer with the
				// same empty string used for an empty input sequence.
				return new XdmAtomicValue(cleaned != null ? cleaned : "");
			}

			/**
			 * Parses the given text with {@link DateParserUtils#parseDate(String)} and formats it as
			 * {@value #DATE_FORMAT}, using the system default time zone for the conversion to a calendar date.
			 *
			 * @param inputDate
			 *            the raw date string, may be null or blank
			 * @return the formatted date, or null when the input is blank or not a recognizable date
			 */
			public String clean(final String inputDate) {
				if (StringUtils.isBlank(inputDate)) { return null; }

				try {
					final LocalDate date = DateParserUtils
						.parseDate(inputDate.trim())
						.toInstant()
						.atZone(ZoneId.systemDefault())
						.toLocalDate();
					return OUTPUT_FORMATTER.format(date);
				} catch (final java.time.DateTimeException e) {
					// DateTimeException is the parent of DateTimeParseException: any java.time failure raised
					// while parsing or converting yields "no date" instead of aborting the transformation.
					return null;
				}
			}
		};
	}
}

View File

@ -0,0 +1,143 @@
package eu.dnetlib.data.mapping.xslt;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import net.sf.saxon.s9api.ExtensionFunction;
import net.sf.saxon.s9api.ItemType;
import net.sf.saxon.s9api.OccurrenceIndicator;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.SequenceType;
import net.sf.saxon.s9api.XdmAtomicValue;
import net.sf.saxon.s9api.XdmValue;
/**
 * XSLT extension function that normalizes a person name.
 *
 * <p>
 * The function is published with local name {@code normalize} in the namespace
 * {@code XsltTransformFactory.QNAME_BASE_URI + "/person"}. It takes an optional string and returns a string.
 */
@Component
public class XsltPersonCleaner implements DnetXsltFunction {

	/**
	 * Normalizes a person name and tries to tell the surname from the first name.
	 *
	 * <p>
	 * The input is Unicode-decomposed (NFD), bracketed parts ({@code (...)}, {@code [...]}, <code>{...}</code>) are
	 * dropped, punctuation other than {@code -} and {@code ,} and all digits are replaced by spaces, and runs of
	 * whitespace are collapsed. The remaining terms are then classified:
	 * <ul>
	 * <li>with a comma: the text before the first comma is the surname, the text after it the first name
	 * ({@code "Artini, MG"} gives {@code "Artini, M G"});</li>
	 * <li>with an initial that is not the last term: everything up to the last initial is the first name
	 * ({@code "Michele G. Artini"} gives {@code "Artini, Michele G"});</li>
	 * <li>with upper-case terms: those are the surname ({@code "Michele ARTINI"} gives
	 * {@code "ARTINI, Michele"});</li>
	 * <li>otherwise, unless the last term is an initial, the last term is the surname ({@code "Michele Artini"}
	 * gives {@code "Artini, Michele"}).</li>
	 * </ul>
	 *
	 * @param s
	 *            the raw person name, may be null or blank
	 * @return {@code "Surname, Firstname"} when both parts were identified, otherwise the cleaned terms joined by a
	 *         single space; the empty string when nothing is left after cleaning. Never null. The result keeps the
	 *         NFD form produced by the cleaning step.
	 */
	public static String normalize(String s) {
		// Guard: Normalizer.normalize() rejects null, and a blank name has nothing to normalize.
		if (StringUtils.isBlank(s)) { return ""; }

		final List<String> firstname = new ArrayList<>();
		final List<String> surname = new ArrayList<>();
		final List<String> fullname = new ArrayList<>();

		s = Normalizer.normalize(s, Normalizer.Form.NFD);
		s = s.replaceAll("\\(.+\\)", "");
		s = s.replaceAll("\\[.+\\]", "");
		s = s.replaceAll("\\{.+\\}", "");
		s = s.replaceAll("\\s+-\\s+", "-");
		s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
		// Regex replacement is required here: String.replace() would look for the literal text "\d".
		s = s.replaceAll("\\d", " ");
		// Dots are already removed by the punctuation rule above and newlines by the whitespace rule below.
		s = s.replaceAll("\\s+", " ");

		if (s.contains(",")) {
			final String[] arr = s.split(",");
			if (arr.length == 1) {
				fullname.addAll(splitTerms(arr[0]));
			} else if (arr.length > 1) {
				// Only the first two comma-separated parts are considered.
				surname.addAll(splitTerms(arr[0]));
				firstname.addAll(splitTermsFirstName(arr[1]));
				fullname.addAll(surname);
				fullname.addAll(firstname);
			}
		} else {
			fullname.addAll(splitTerms(s));

			final int size = fullname.size();
			int lastInitialPosition = size; // "size" means: no initial found
			boolean hasSurnameInUpperCase = false;

			for (int i = 0; i < size; i++) {
				final String term = fullname.get(i);
				if (term.length() == 1) {
					lastInitialPosition = i;
				} else if (term.equals(term.toUpperCase())) {
					hasSurnameInUpperCase = true;
				}
			}

			if (lastInitialPosition < size - 1) { // Case: Michele G. Artini
				firstname.addAll(fullname.subList(0, lastInitialPosition + 1));
				surname.addAll(fullname.subList(lastInitialPosition + 1, size));
			} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
				for (final String term : fullname) {
					if (term.length() > 1 && term.equals(term.toUpperCase())) {
						surname.add(term);
					} else {
						firstname.add(term);
					}
				}
			} else if (lastInitialPosition == size && size > 0) { // Case: Michele Artini
				// size > 0: without it, a name reduced to nothing by the cleaning would ask for subList(-1, 0).
				surname.addAll(fullname.subList(size - 1, size));
				firstname.addAll(fullname.subList(0, size - 1));
			}
		}

		if (!surname.isEmpty() && !firstname.isEmpty()) {
			return String.join(" ", surname) + ", " + String.join(" ", firstname);
		}
		return String.join(" ", fullname);
	}

	/**
	 * Splits the first-name part of a name into terms. When the whole part is a group of two or three upper-case
	 * letters (e.g. {@code "MG"}) it is read as a sequence of initials and split letter by letter; otherwise it is
	 * split on spaces.
	 */
	private static List<String> splitTermsFirstName(final String s) {
		final String trimmed = s.trim();
		if (!trimmed.matches("\\p{Lu}{2,3}")) { return splitTerms(s); }

		final List<String> list = new ArrayList<>();
		for (final String p : trimmed.split("(?=\\p{Lu})")) { // split before each Unicode upper-case letter
			if (p.length() > 0) {
				list.add(p);
			}
		}
		return list;
	}

	/** Splits the given text on spaces, discarding blank terms. */
	private static List<String> splitTerms(final String s) {
		return Arrays.stream(s.split(" "))
			.filter(StringUtils::isNotBlank)
			.collect(Collectors.toList());
	}

	@Override
	public ExtensionFunction asExtensionFunction() {
		return new ExtensionFunction() {

			@Override
			public QName getName() {
				return new QName(XsltTransformFactory.QNAME_BASE_URI + "/person", "normalize");
			}

			@Override
			public SequenceType getResultType() {
				return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
			}

			@Override
			public SequenceType[] getArgumentTypes() {
				return new SequenceType[] {
					SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
				};
			}

			/**
			 * Normalizes the first item of the first argument.
			 *
			 * @param xdmValues
			 *            the call arguments; only the first item of the first argument is read
			 * @return the normalized name, or an empty string when the argument is an empty sequence
			 */
			@Override
			public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
				final XdmValue r = xdmValues[0];
				if (r.size() == 0) { return new XdmAtomicValue(""); }
				return new XdmAtomicValue(normalize(r.itemAt(0).getStringValue()));
			}
		};
	}
}

View File

@ -2,16 +2,21 @@ package eu.dnetlib.data.mapping.xslt;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import javax.xml.transform.stream.StreamSource;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import eu.dnetlib.data.mapping.RecordTransformer;
import eu.dnetlib.errors.TransformationException;
import eu.dnetlib.is.resource.SimpleResourceService;
import net.sf.saxon.s9api.ExtensionFunction;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
@ -22,20 +27,29 @@ import net.sf.saxon.s9api.XsltCompiler;
import net.sf.saxon.s9api.XsltExecutable;
import net.sf.saxon.s9api.XsltTransformer;
@Service
public class XsltTransformFactory {
public static final String QNAME_BASE_URI = "http://eu/dnetlib/transform";
private static final Log log = LogFactory.getLog(XsltTransformFactory.class);
@Autowired
private SimpleResourceService simpleResourceService;
@Autowired
private List<DnetXsltFunction> xsltFunctions;
public RecordTransformer<String, String> getTransformer(final String ruleId, final Map<String, String> initialParams) throws TransformationException {
try {
final String xsltText = simpleResourceService.getContent(ruleId);
final Processor processor = new Processor(false);
// TODO consider the following functions
// processor.registerExtensionFunction(cleanFunction);
// processor.registerExtensionFunction(new DateCleaner());
// processor.registerExtensionFunction(new PersonCleaner());
xsltFunctions.forEach(f -> {
final ExtensionFunction extFunction = f.asExtensionFunction();
processor.registerExtensionFunction(extFunction);
log.info("New XSLT function registered: " + extFunction.getName());
});
final XsltCompiler comp = processor.newXsltCompiler();
initialParams.forEach((k, v) -> {

View File

@ -0,0 +1,79 @@
package eu.dnetlib.data.mapping.xslt;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import eu.dnetlib.is.model.vocabulary.Synonym;
import eu.dnetlib.is.vocabulary.VocabularyService;
import net.sf.saxon.s9api.ExtensionFunction;
import net.sf.saxon.s9api.ItemType;
import net.sf.saxon.s9api.OccurrenceIndicator;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.SequenceType;
import net.sf.saxon.s9api.XdmAtomicValue;
import net.sf.saxon.s9api.XdmValue;
/**
 * XSLT extension function that replaces a value with the code of the vocabulary term it is a synonym of.
 *
 * <p>
 * The function is published with local name {@code clean} in the namespace
 * {@code XsltTransformFactory.QNAME_BASE_URI + "/clean"} and takes two arguments: the value to clean and the
 * identifier of the vocabulary to look it up in.
 */
@Component
public class XsltVocabularyCleaner implements DnetXsltFunction {

	private static final String SEPARATOR = "@#@";

	@Autowired
	private VocabularyService vocabularyService;

	/**
	 * Builds the extension function. The synonym lookup table is read from the vocabulary service each time this
	 * method is invoked and is then captured by the returned function.
	 */
	@Override
	public ExtensionFunction asExtensionFunction() {
		final Map<String, String> codesBySynonym = loadSynonymIndex();

		return new ExtensionFunction() {

			@Override
			public QName getName() {
				return new QName(XsltTransformFactory.QNAME_BASE_URI + "/clean", "clean");
			}

			@Override
			public SequenceType getResultType() {
				return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE_OR_MORE);
			}

			@Override
			public SequenceType[] getArgumentTypes() {
				final SequenceType valueType = SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_MORE);
				final SequenceType vocabularyType = SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE);
				return new SequenceType[] { valueType, vocabularyType };
			}

			/**
			 * Looks up the first item of the first argument among the synonyms of the vocabulary named by the
			 * second argument; the lookup ignores case.
			 *
			 * @return the matching term code; the original value when there is no match or the code is blank; an
			 *         empty string when the first argument is an empty sequence
			 */
			@Override
			public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
				final XdmValue input = xdmValues[0];
				if (input.size() == 0) { return new XdmAtomicValue(""); }

				final String rawValue = input.itemAt(0).getStringValue();
				final String vocId = xdmValues[1].itemAt(0).getStringValue();

				final String lookupKey = (vocId + SEPARATOR + rawValue).toLowerCase();
				final String mapped = codesBySynonym.getOrDefault(lookupKey, rawValue);

				if (StringUtils.isNotBlank(mapped)) { return new XdmAtomicValue(mapped); }
				return new XdmAtomicValue(rawValue);
			}
		};
	}

	/**
	 * Reads every vocabulary and indexes the code of each term under the lower-cased key
	 * {@code vocabularyId + SEPARATOR + synonym}, one entry per synonym.
	 */
	private Map<String, String> loadSynonymIndex() {
		final Map<String, String> index = new HashMap<>();

		vocabularyService.listVocs().forEach(voc -> vocabularyService.listTerms(voc.getId()).forEach(term -> {
			for (final Synonym synonym : term.getSynonyms()) {
				final String entryKey = (voc.getId() + SEPARATOR + synonym.getTerm()).toLowerCase();
				final String code = term.getCode();
				index.put(entryKey, code);
			}
		}));

		return index;
	}
}