139 lines
4.1 KiB
Java
139 lines
4.1 KiB
Java
package eu.dnetlib.common.mapping.xslt.functions;
|
|
|
|
import java.text.Normalizer;
|
|
import java.util.ArrayList;
|
|
import java.util.Arrays;
|
|
import java.util.List;
|
|
import java.util.stream.Collectors;
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
import org.springframework.stereotype.Component;
|
|
|
|
import eu.dnetlib.common.mapping.xslt.XsltTransformerFactory;
|
|
import net.sf.saxon.s9api.ExtensionFunction;
|
|
import net.sf.saxon.s9api.ItemType;
|
|
import net.sf.saxon.s9api.OccurrenceIndicator;
|
|
import net.sf.saxon.s9api.QName;
|
|
import net.sf.saxon.s9api.SaxonApiException;
|
|
import net.sf.saxon.s9api.SequenceType;
|
|
import net.sf.saxon.s9api.XdmAtomicValue;
|
|
import net.sf.saxon.s9api.XdmValue;
|
|
|
|
@Component
|
|
public class XsltPersonCleaner implements ExtensionFunction {
|
|
|
|
@Override
|
|
public QName getName() {
|
|
return new QName(XsltTransformerFactory.QNAME_BASE_URI + "/persons", "normalize");
|
|
}
|
|
|
|
@Override
|
|
public SequenceType getResultType() {
|
|
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
|
|
}
|
|
|
|
@Override
|
|
public SequenceType[] getArgumentTypes() {
|
|
return new SequenceType[] {
|
|
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
|
|
};
|
|
}
|
|
|
|
@Override
|
|
public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
|
|
final XdmValue r = xdmValues[0];
|
|
if (r.size() == 0) { return new XdmAtomicValue(""); }
|
|
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
|
|
return new XdmAtomicValue(normalize(currentValue));
|
|
}
|
|
|
|
public static String normalize(String s) {
|
|
final List<String> firstname = new ArrayList<>();
|
|
final List<String> surname = new ArrayList<>();
|
|
final List<String> fullname = new ArrayList<>();
|
|
|
|
s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
|
|
s = s.replaceAll("\\(.+\\)", "");
|
|
s = s.replaceAll("\\[.+\\]", "");
|
|
s = s.replaceAll("\\{.+\\}", "");
|
|
s = s.replaceAll("\\s+-\\s+", "-");
|
|
s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
|
|
s = s.replace("\\d", " ");
|
|
s = s.replace("\\n", " ");
|
|
s = s.replace("\\.", " ");
|
|
s = s.replaceAll("\\s+", " ");
|
|
|
|
if (s.contains(",")) {
|
|
final String[] arr = s.split(",");
|
|
if (arr.length == 1) {
|
|
fullname.addAll(splitTerms(arr[0]));
|
|
} else if (arr.length > 1) {
|
|
surname.addAll(splitTerms(arr[0]));
|
|
firstname.addAll(splitTermsFirstName(arr[1]));
|
|
fullname.addAll(surname);
|
|
fullname.addAll(firstname);
|
|
}
|
|
} else {
|
|
fullname.addAll(splitTerms(s));
|
|
|
|
int lastInitialPosition = fullname.size();
|
|
boolean hasSurnameInUpperCase = false;
|
|
|
|
for (int i = 0; i < fullname.size(); i++) {
|
|
final String term = fullname.get(i);
|
|
if (term.length() == 1) {
|
|
lastInitialPosition = i;
|
|
} else if (term.equals(term.toUpperCase())) {
|
|
hasSurnameInUpperCase = true;
|
|
}
|
|
}
|
|
if (lastInitialPosition < (fullname.size() - 1)) { // Case: Michele G. Artini
|
|
firstname.addAll(fullname.subList(0, lastInitialPosition + 1));
|
|
surname.addAll(fullname.subList(lastInitialPosition + 1, fullname.size()));
|
|
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
|
|
for (final String term : fullname) {
|
|
if ((term.length() > 1) && term.equals(term.toUpperCase())) {
|
|
surname.add(term);
|
|
} else {
|
|
firstname.add(term);
|
|
}
|
|
}
|
|
} else if (lastInitialPosition == fullname.size()) {
|
|
surname.addAll(fullname.subList(lastInitialPosition - 1, fullname.size()));
|
|
firstname.addAll(fullname.subList(0, lastInitialPosition - 1));
|
|
}
|
|
|
|
}
|
|
return null;
|
|
}
|
|
|
|
private static List<String> splitTermsFirstName(final String s) {
|
|
|
|
final List<String> list = new ArrayList<>();
|
|
|
|
Arrays.stream(s.split(" "))
|
|
.filter(StringUtils::isNotBlank)
|
|
.forEach(part -> {
|
|
if (s.trim().matches("\\p{Lu}{2,3}")) {
|
|
final String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase)
|
|
for (final String p : parts) {
|
|
if (p.length() > 0) {
|
|
list.add(p);
|
|
}
|
|
}
|
|
} else {
|
|
list.add(part);
|
|
}
|
|
});
|
|
return list;
|
|
|
|
}
|
|
|
|
private static List<String> splitTerms(final String s) {
|
|
return Arrays.stream(s.split(" "))
|
|
.filter(StringUtils::isNotBlank)
|
|
.collect(Collectors.toList());
|
|
}
|
|
|
|
}
|