dnet-docker/dnet-app/libs/dnet-common-mapping/src/main/java/eu/dnetlib/common/mapping/xslt/functions/XsltPersonCleaner.java

139 lines
4.1 KiB
Java

package eu.dnetlib.common.mapping.xslt.functions;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import eu.dnetlib.common.mapping.xslt.XsltTransformerFactory;
import net.sf.saxon.s9api.ExtensionFunction;
import net.sf.saxon.s9api.ItemType;
import net.sf.saxon.s9api.OccurrenceIndicator;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.SequenceType;
import net.sf.saxon.s9api.XdmAtomicValue;
import net.sf.saxon.s9api.XdmValue;
/**
 * Saxon extension function that normalizes a person name.
 *
 * <p>The function is exposed under the namespace
 * {@code XsltTransformerFactory.QNAME_BASE_URI + "/persons"} with local name
 * {@code normalize}; it takes an optional string and returns an optional string.
 */
@Component
public class XsltPersonCleaner implements ExtensionFunction {

    /**
     * @return the qualified name under which the function is registered
     */
    @Override
    public QName getName() {
        return new QName(XsltTransformerFactory.QNAME_BASE_URI + "/persons", "normalize");
    }

    /**
     * @return the declared result type: zero or one string
     */
    @Override
    public SequenceType getResultType() {
        return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
    }

    /**
     * @return the declared argument types: a single argument holding zero or one string
     */
    @Override
    public SequenceType[] getArgumentTypes() {
        return new SequenceType[] {
            SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
        };
    }

    /**
     * Applies {@link #normalize(String)} to the string value of the first item of the first argument.
     *
     * @param xdmValues the call arguments; only the first one is read
     * @return the normalized name, or an empty string when the argument is the empty sequence
     * @throws SaxonApiException declared by the interface
     */
    @Override
    public XdmValue call(final XdmValue[] xdmValues) throws SaxonApiException {
        final XdmValue r = xdmValues[0];
        if (r.size() == 0) { return new XdmAtomicValue(""); }
        final String currentValue = r.itemAt(0).getStringValue();
        return new XdmAtomicValue(normalize(currentValue));
    }

    /**
     * Cleans a person name and tries to split it into surname and first name.
     *
     * <p>Cleaning: the text is decomposed (NFD) and never recomposed, so the result is in NFD form;
     * bracketed parts ({@code (..)}, {@code [..]}, {@code {..}}) are removed; every punctuation
     * character except {@code -} and {@code ,} and every digit becomes a space; runs of whitespace
     * collapse to a single space.
     *
     * <p>Splitting:
     * <ul>
     * <li>with a comma: the text before the first comma is the surname, the text between the first
     * and the second comma is the first name;</li>
     * <li>without a comma: if a one-character term (an initial) occurs before the last term, everything
     * up to the last initial is the first name and the rest is the surname; otherwise, if some term is
     * entirely upper case, the upper-case terms are the surname; otherwise the last term is the
     * surname.</li>
     * </ul>
     *
     * @param s the raw name; {@code null} is treated as empty
     * @return {@code "Surname, Firstname"} when both parts were identified, otherwise the cleaned
     *         terms joined by single spaces (possibly empty); never {@code null}
     */
    public static String normalize(String s) {
        if (s == null) { return ""; }

        final List<String> firstname = new ArrayList<>();
        final List<String> surname = new ArrayList<>();
        final List<String> fullname = new ArrayList<>();

        s = Normalizer.normalize(s, Normalizer.Form.NFD);
        // The three bracket patterns are greedy: everything from the first opening
        // bracket to the last matching closing bracket is removed.
        s = s.replaceAll("\\(.+\\)", "");
        s = s.replaceAll("\\[.+\\]", "");
        s = s.replaceAll("\\{.+\\}", "");
        s = s.replaceAll("\\s+-\\s+", "-");
        s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
        // Digits need the regex-based replaceAll; String.replace would look for a literal backslash followed by 'd'.
        s = s.replaceAll("\\d", " ");
        // Dots are already covered by the punctuation step above, newlines by the whitespace step below.
        s = s.replaceAll("\\s+", " ");

        if (s.contains(",")) {
            final String[] arr = s.split(",");
            if (arr.length == 1) {
                fullname.addAll(splitTerms(arr[0]));
            } else if (arr.length > 1) {
                surname.addAll(splitTerms(arr[0]));
                firstname.addAll(splitTermsFirstName(arr[1]));
                fullname.addAll(surname);
                fullname.addAll(firstname);
            }
        } else {
            fullname.addAll(splitTerms(s));
            final int size = fullname.size();

            int lastInitialPosition = size;
            boolean hasSurnameInUpperCase = false;
            for (int i = 0; i < size; i++) {
                final String term = fullname.get(i);
                if (term.length() == 1) {
                    lastInitialPosition = i;
                } else if (term.equals(term.toUpperCase())) {
                    hasSurnameInUpperCase = true;
                }
            }

            if (lastInitialPosition < (size - 1)) { // Case: Michele G. Artini
                firstname.addAll(fullname.subList(0, lastInitialPosition + 1));
                surname.addAll(fullname.subList(lastInitialPosition + 1, size));
            } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
                for (final String term : fullname) {
                    if ((term.length() > 1) && term.equals(term.toUpperCase())) {
                        surname.add(term);
                    } else {
                        firstname.add(term);
                    }
                }
            } else if ((size > 0) && (lastInitialPosition == size)) { // Case: Michele Artini
                // size > 0 keeps subList away from index -1 when the cleaned text has no terms.
                surname.addAll(fullname.subList(size - 1, size));
                firstname.addAll(fullname.subList(0, size - 1));
            }
        }

        if (!surname.isEmpty() && !firstname.isEmpty()) {
            return String.join(" ", surname) + ", " + String.join(" ", firstname);
        }
        return String.join(" ", fullname);
    }

    /**
     * Splits the first-name part into terms.
     *
     * <p>When the whole trimmed text consists of two or three upper-case letters (e.g. {@code "MG"}),
     * each letter becomes its own term; otherwise the text is split on spaces like
     * {@link #splitTerms(String)}.
     *
     * @param s the first-name part of a name
     * @return the list of non-blank terms
     */
    private static List<String> splitTermsFirstName(final String s) {
        final String trimmed = s.trim();
        if (trimmed.matches("\\p{Lu}{2,3}")) {
            return Arrays.stream(trimmed.split("(?=\\p{Lu})")) // (Unicode UpperCase)
                .filter(p -> p.length() > 0)
                .collect(Collectors.toList());
        }
        return splitTerms(s);
    }

    /**
     * Splits a text on single spaces, discarding blank pieces.
     *
     * @param s the text to split
     * @return the list of non-blank terms
     */
    private static List<String> splitTerms(final String s) {
        return Arrays.stream(s.split(" "))
            .filter(StringUtils::isNotBlank)
            .collect(Collectors.toList());
    }
}