xsl functions
This commit is contained in:
parent
0beb8ea671
commit
f348e5c1a0
|
@ -23,15 +23,15 @@
|
|||
<artifactId>commons-codec</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.github.sisyphsu</groupId>
|
||||
<artifactId>dateparser</artifactId>
|
||||
<version>1.0.11</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>xml-apis</groupId>
|
||||
<artifactId>xml-apis</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<!-- Hadoop -->
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
package eu.dnetlib.data.mapping.xslt;
|
||||
|
||||
import net.sf.saxon.s9api.ExtensionFunction;
|
||||
|
||||
public interface DnetXsltFunction {
|
||||
|
||||
ExtensionFunction asExtensionFunction();
|
||||
}
|
|
@ -0,0 +1,73 @@
|
|||
package eu.dnetlib.data.mapping.xslt;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.time.format.DateTimeParseException;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||
|
||||
import net.sf.saxon.s9api.ExtensionFunction;
|
||||
import net.sf.saxon.s9api.ItemType;
|
||||
import net.sf.saxon.s9api.OccurrenceIndicator;
|
||||
import net.sf.saxon.s9api.QName;
|
||||
import net.sf.saxon.s9api.SaxonApiException;
|
||||
import net.sf.saxon.s9api.SequenceType;
|
||||
import net.sf.saxon.s9api.XdmAtomicValue;
|
||||
import net.sf.saxon.s9api.XdmValue;
|
||||
|
||||
@Component
|
||||
public class XsltDateCleaner implements DnetXsltFunction {
|
||||
|
||||
public static final String DATE_FORMAT = "yyyy-MM-dd";
|
||||
|
||||
@Override
|
||||
public ExtensionFunction asExtensionFunction() {
|
||||
return new ExtensionFunction() {
|
||||
|
||||
@Override
|
||||
public QName getName() {
|
||||
return new QName(XsltTransformFactory.QNAME_BASE_URI + "/dateISO", "dateISO");
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType() {
|
||||
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] {
|
||||
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
|
||||
final XdmValue r = xdmValues[0];
|
||||
if (r.size() == 0) { return new XdmAtomicValue(""); }
|
||||
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
||||
return new XdmAtomicValue(clean(currentValue));
|
||||
}
|
||||
|
||||
public String clean(final String inputDate) {
|
||||
if (StringUtils.isBlank(inputDate)) { return null; }
|
||||
|
||||
try {
|
||||
final LocalDate date = DateParserUtils
|
||||
.parseDate(inputDate.trim())
|
||||
.toInstant()
|
||||
.atZone(ZoneId.systemDefault())
|
||||
.toLocalDate();
|
||||
return DateTimeFormatter.ofPattern(DATE_FORMAT).format(date);
|
||||
} catch (final DateTimeParseException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,143 @@
|
|||
package eu.dnetlib.data.mapping.xslt;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import net.sf.saxon.s9api.ExtensionFunction;
|
||||
import net.sf.saxon.s9api.ItemType;
|
||||
import net.sf.saxon.s9api.OccurrenceIndicator;
|
||||
import net.sf.saxon.s9api.QName;
|
||||
import net.sf.saxon.s9api.SaxonApiException;
|
||||
import net.sf.saxon.s9api.SequenceType;
|
||||
import net.sf.saxon.s9api.XdmAtomicValue;
|
||||
import net.sf.saxon.s9api.XdmValue;
|
||||
|
||||
@Component
|
||||
public class XsltPersonCleaner implements DnetXsltFunction {
|
||||
|
||||
public static String normalize(String s) {
|
||||
final List<String> firstname = new ArrayList<>();
|
||||
final List<String> surname = new ArrayList<>();
|
||||
final List<String> fullname = new ArrayList<>();
|
||||
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
|
||||
s = s.replaceAll("\\(.+\\)", "");
|
||||
s = s.replaceAll("\\[.+\\]", "");
|
||||
s = s.replaceAll("\\{.+\\}", "");
|
||||
s = s.replaceAll("\\s+-\\s+", "-");
|
||||
s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
|
||||
s = s.replace("\\d", " ");
|
||||
s = s.replace("\\n", " ");
|
||||
s = s.replace("\\.", " ");
|
||||
s = s.replaceAll("\\s+", " ");
|
||||
|
||||
if (s.contains(",")) {
|
||||
final String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
fullname.addAll(splitTerms(arr[0]));
|
||||
} else if (arr.length > 1) {
|
||||
surname.addAll(splitTerms(arr[0]));
|
||||
firstname.addAll(splitTermsFirstName(arr[1]));
|
||||
fullname.addAll(surname);
|
||||
fullname.addAll(firstname);
|
||||
}
|
||||
} else {
|
||||
fullname.addAll(splitTerms(s));
|
||||
|
||||
int lastInitialPosition = fullname.size();
|
||||
boolean hasSurnameInUpperCase = false;
|
||||
|
||||
for (int i = 0; i < fullname.size(); i++) {
|
||||
final String term = fullname.get(i);
|
||||
if (term.length() == 1) {
|
||||
lastInitialPosition = i;
|
||||
} else if (term.equals(term.toUpperCase())) {
|
||||
hasSurnameInUpperCase = true;
|
||||
}
|
||||
}
|
||||
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
||||
firstname.addAll(fullname.subList(0, lastInitialPosition + 1));
|
||||
surname.addAll(fullname.subList(lastInitialPosition + 1, fullname.size()));
|
||||
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
|
||||
for (final String term : fullname) {
|
||||
if (term.length() > 1 && term.equals(term.toUpperCase())) {
|
||||
surname.add(term);
|
||||
} else {
|
||||
firstname.add(term);
|
||||
}
|
||||
}
|
||||
} else if (lastInitialPosition == fullname.size()) {
|
||||
surname.addAll(fullname.subList(lastInitialPosition - 1, fullname.size()));
|
||||
firstname.addAll(fullname.subList(0, lastInitialPosition - 1));
|
||||
}
|
||||
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static List<String> splitTermsFirstName(final String s) {
|
||||
|
||||
final List<String> list = new ArrayList<>();
|
||||
|
||||
Arrays.stream(s.split(" "))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.forEach(part -> {
|
||||
if (s.trim().matches("\\p{Lu}{2,3}")) {
|
||||
final String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase)
|
||||
for (final String p : parts) {
|
||||
if (p.length() > 0) {
|
||||
list.add(p);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
list.add(part);
|
||||
}
|
||||
});
|
||||
return list;
|
||||
|
||||
}
|
||||
|
||||
private static List<String> splitTerms(final String s) {
|
||||
return Arrays.stream(s.split(" "))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExtensionFunction asExtensionFunction() {
|
||||
return new ExtensionFunction() {
|
||||
|
||||
@Override
|
||||
public QName getName() {
|
||||
return new QName(XsltTransformFactory.QNAME_BASE_URI + "/person", "normalize");
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType() {
|
||||
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] {
|
||||
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
|
||||
final XdmValue r = xdmValues[0];
|
||||
if (r.size() == 0) { return new XdmAtomicValue(""); }
|
||||
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
||||
return new XdmAtomicValue(normalize(currentValue));
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
}
|
|
@ -2,16 +2,21 @@ package eu.dnetlib.data.mapping.xslt;
|
|||
|
||||
import java.io.StringWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.xml.transform.stream.StreamSource;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import eu.dnetlib.data.mapping.RecordTransformer;
|
||||
import eu.dnetlib.errors.TransformationException;
|
||||
import eu.dnetlib.is.resource.SimpleResourceService;
|
||||
import net.sf.saxon.s9api.ExtensionFunction;
|
||||
import net.sf.saxon.s9api.Processor;
|
||||
import net.sf.saxon.s9api.QName;
|
||||
import net.sf.saxon.s9api.SaxonApiException;
|
||||
|
@ -22,20 +27,29 @@ import net.sf.saxon.s9api.XsltCompiler;
|
|||
import net.sf.saxon.s9api.XsltExecutable;
|
||||
import net.sf.saxon.s9api.XsltTransformer;
|
||||
|
||||
@Service
|
||||
public class XsltTransformFactory {
|
||||
|
||||
public static final String QNAME_BASE_URI = "http://eu/dnetlib/transform";
|
||||
|
||||
private static final Log log = LogFactory.getLog(XsltTransformFactory.class);
|
||||
|
||||
@Autowired
|
||||
private SimpleResourceService simpleResourceService;
|
||||
|
||||
@Autowired
|
||||
private List<DnetXsltFunction> xsltFunctions;
|
||||
|
||||
public RecordTransformer<String, String> getTransformer(final String ruleId, final Map<String, String> initialParams) throws TransformationException {
|
||||
try {
|
||||
final String xsltText = simpleResourceService.getContent(ruleId);
|
||||
|
||||
final Processor processor = new Processor(false);
|
||||
// TODO consider the following functions
|
||||
// processor.registerExtensionFunction(cleanFunction);
|
||||
// processor.registerExtensionFunction(new DateCleaner());
|
||||
// processor.registerExtensionFunction(new PersonCleaner());
|
||||
xsltFunctions.forEach(f -> {
|
||||
final ExtensionFunction extFunction = f.asExtensionFunction();
|
||||
processor.registerExtensionFunction(extFunction);
|
||||
log.info("New XSLT function registered: " + extFunction.getName());
|
||||
});
|
||||
|
||||
final XsltCompiler comp = processor.newXsltCompiler();
|
||||
initialParams.forEach((k, v) -> {
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
package eu.dnetlib.data.mapping.xslt;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import eu.dnetlib.is.model.vocabulary.Synonym;
|
||||
import eu.dnetlib.is.vocabulary.VocabularyService;
|
||||
import net.sf.saxon.s9api.ExtensionFunction;
|
||||
import net.sf.saxon.s9api.ItemType;
|
||||
import net.sf.saxon.s9api.OccurrenceIndicator;
|
||||
import net.sf.saxon.s9api.QName;
|
||||
import net.sf.saxon.s9api.SaxonApiException;
|
||||
import net.sf.saxon.s9api.SequenceType;
|
||||
import net.sf.saxon.s9api.XdmAtomicValue;
|
||||
import net.sf.saxon.s9api.XdmValue;
|
||||
|
||||
@Component
|
||||
public class XsltVocabularyCleaner implements DnetXsltFunction {
|
||||
|
||||
@Autowired
|
||||
private VocabularyService vocabularyService;
|
||||
|
||||
private static final String SEPARATOR = "@#@";
|
||||
|
||||
@Override
|
||||
public ExtensionFunction asExtensionFunction() {
|
||||
|
||||
final Map<String, String> termsMap = new HashMap<>();
|
||||
|
||||
vocabularyService.listVocs().forEach(voc -> {
|
||||
vocabularyService.listTerms(voc.getId()).forEach(term -> {
|
||||
for (final Synonym s : term.getSynonyms()) {
|
||||
final String k = (voc.getId() + SEPARATOR + s.getTerm()).toLowerCase();
|
||||
final String v = term.getCode();
|
||||
termsMap.put(k, v);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
return new ExtensionFunction() {
|
||||
|
||||
@Override
|
||||
public QName getName() {
|
||||
return new QName(XsltTransformFactory.QNAME_BASE_URI + "/clean", "clean");
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType() {
|
||||
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE_OR_MORE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] {
|
||||
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_MORE),
|
||||
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE)
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
|
||||
final XdmValue r = xdmValues[0];
|
||||
if (r.size() == 0) { return new XdmAtomicValue(""); }
|
||||
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
||||
final String vocId = xdmValues[1].itemAt(0).getStringValue();
|
||||
|
||||
final String key = (vocId + SEPARATOR + currentValue).toLowerCase();
|
||||
final String cleanedValue = termsMap.getOrDefault(key, currentValue);
|
||||
|
||||
return new XdmAtomicValue(StringUtils.isNotBlank(cleanedValue) ? cleanedValue : currentValue);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue