From 3b694074ffab57fda01d9fc58b6d5bde9a847132 Mon Sep 17 00:00:00 2001
From: Andreas Czerniak
Date: Fri, 9 Apr 2021 14:35:30 +0200
Subject: [PATCH] add xslt, personname cleaner

---
 .../transformation/xslt/PersonCleaner.java    | 223 ++++++++++++
 .../xslt/XSLTTransformationFunction.java      |   1 +
 .../transformation/xslt/utils/Capitalize.java |  14 +
 .../xslt/utils/DotAbbreviations.java          |  12 +
 .../transformation/TransformationJobTest.java |  25 +-
 .../dnetlib/dhp/transform/input_omicsdi.xml   |  60 ++++
 .../scripts/xslt_cleaning_REST_OmicsDI.xsl    | 297 ++++++++++++++++++
 7 files changed, 623 insertions(+), 9 deletions(-)
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml
 create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java
new file mode 100644
index 000000000..069060722
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java
@@ -0,0 +1,223 @@
+
+package eu.dnetlib.dhp.transformation.xslt;
+
+import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
+
+import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.text.Normalizer;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.hash.Hashing;
+
+import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize;
+import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations;
+import net.sf.saxon.s9api.ExtensionFunction;
+import net.sf.saxon.s9api.ItemType;
+import net.sf.saxon.s9api.OccurrenceIndicator;
+import net.sf.saxon.s9api.QName;
+import net.sf.saxon.s9api.SaxonApiException;
+import net.sf.saxon.s9api.SequenceType;
+import net.sf.saxon.s9api.XdmAtomicValue;
+import net.sf.saxon.s9api.XdmValue;
+
+/**
+ * Saxon extension function that splits a person name into first-name and surname terms and
+ * renders it as "Surname, Firstname" when both parts could be identified.
+ *
+ * The parsed terms are held in instance fields that every normalize(String) call replaces, and
+ * nothing here is synchronized, so do not share one instance between threads.
+ */
+public class PersonCleaner implements ExtensionFunction, Serializable {
+
+	private static final long serialVersionUID = 1L;
+
+	private List<String> firstname = Lists.newArrayList();
+	private List<String> surname = Lists.newArrayList();
+	private List<String> fullname = Lists.newArrayList();
+
+	// Reserved for a name-particle list; it is never populated, so splitTerms filters nothing.
+	private static Set<String> particles = null;
+
+	public PersonCleaner() {
+	}
+
+	/**
+	 * Parses a person name and returns its normalised form.
+	 *
+	 * Bracketed text, digits and punctuation other than '-' and ',' are dropped. A name with a
+	 * comma is read as "surname, firstname"; otherwise the split is guessed from one-letter
+	 * initials, upper-case terms, or the position of the last term.
+	 *
+	 * @param s the raw name; null is treated like an empty name
+	 * @return the value of getNormalisedFullname() for the parsed name, "" when nothing is left
+	 */
+	public String normalize(String s) {
+		// Start from fresh lists so a reused instance never mixes terms of two names.
+		firstname = Lists.newArrayList();
+		surname = Lists.newArrayList();
+		fullname = Lists.newArrayList();
+		if (s == null) {
+			return "";
+		}
+
+		s = Normalizer.normalize(s, Normalizer.Form.NFD);
+		s = s.replaceAll("\\(.+\\)", "");
+		s = s.replaceAll("\\[.+\\]", "");
+		s = s.replaceAll("\\{.+\\}", "");
+		s = s.replaceAll("\\s+-\\s+", "-");
+		// Also turns '.' into a blank, since '.' belongs to \p{Punct}.
+		s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
+		// replaceAll, not replace: "\\d" has to be read as a regex to match digits.
+		s = s.replaceAll("\\d", " ");
+		// Collapses every whitespace run, line breaks included, into one blank.
+		s = s.replaceAll("\\s+", " ");
+
+		if (s.contains(",")) {
+			parseSurnameFirst(s);
+		} else {
+			parseFirstnameFirst(s);
+		}
+		return getNormalisedFullname();
+	}
+
+	/** Handles "surname, firstname": the text before the first comma is the surname. */
+	private void parseSurnameFirst(String s) {
+		String[] arr = s.split(",");
+		if (arr.length == 1) {
+			fullname = splitTerms(arr[0]);
+		} else if (arr.length > 1) {
+			surname = splitTerms(arr[0]);
+			firstname = splitTermsFirstName(arr[1]);
+			fullname.addAll(surname);
+			fullname.addAll(firstname);
+		}
+	}
+
+	/** Handles a name without comma by guessing where the first name ends. */
+	private void parseFirstnameFirst(String s) {
+		fullname = splitTerms(s);
+		if (fullname.isEmpty()) {
+			// Nothing to split; without this guard the last branch would ask for subList(-1, 0).
+			return;
+		}
+
+		int lastInitialPosition = fullname.size();
+		boolean hasSurnameInUpperCase = false;
+		for (int i = 0; i < fullname.size(); i++) {
+			String term = fullname.get(i);
+			if (term.length() == 1) {
+				lastInitialPosition = i;
+			} else if (term.equals(term.toUpperCase())) {
+				hasSurnameInUpperCase = true;
+			}
+		}
+
+		// subList returns views of fullname; copy them so the three lists stay independent.
+		if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
+			firstname = Lists.newArrayList(fullname.subList(0, lastInitialPosition + 1));
+			surname = Lists.newArrayList(fullname.subList(lastInitialPosition + 1, fullname.size()));
+		} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
+			for (String term : fullname) {
+				if (term.length() > 1 && term.equals(term.toUpperCase())) {
+					surname.add(term);
+				} else {
+					firstname.add(term);
+				}
+			}
+		} else if (lastInitialPosition == fullname.size()) { // Case: Michele Artini
+			surname = Lists.newArrayList(fullname.subList(lastInitialPosition - 1, fullname.size()));
+			firstname = Lists.newArrayList(fullname.subList(0, lastInitialPosition - 1));
+		}
+	}
+
+	/**
+	 * Splits the first-name part into terms; a part consisting only of two or three upper-case
+	 * letters (e.g. "JK") is split into one term per letter.
+	 */
+	private List<String> splitTermsFirstName(String s) {
+		String trimmed = s.trim();
+		if (!trimmed.matches("\\p{Lu}{2,3}")) {
+			return splitTerms(s);
+		}
+		List<String> list = Lists.newArrayList();
+		for (String p : trimmed.split("(?=\\p{Lu})")) { // (Unicode UpperCase)
+			if (p.length() > 0) {
+				list.add(p);
+			}
+		}
+		return list;
+	}
+
+	/** Splits on blanks and drops empty tokens. */
+	private List<String> splitTerms(String s) {
+		return Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().split(s));
+	}
+
+	public List<String> getFirstname() {
+		return firstname;
+	}
+
+	public List<String> getSurname() {
+		return surname;
+	}
+
+	public List<String> getFullname() {
+		return fullname;
+	}
+
+	public String hash() {
+		return Hashing.murmur3_128().hashString(getNormalisedFullname(), StandardCharsets.UTF_8).toString();
+	}
+
+	/** Returns "surname, firstname" with dotted initials, or the plain terms if the split failed. */
+	public String getNormalisedFullname() {
+		return isAccurate()
+			? Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations())
+			: Joiner.on(" ").join(fullname);
+	}
+
+	public List<String> getCapitalSurname() {
+		return Lists.newArrayList(Iterables.transform(surname, new Capitalize()));
+	}
+
+	public List<String> getNameWithAbbreviations() {
+		return Lists.newArrayList(Iterables.transform(firstname, new DotAbbreviations()));
+	}
+
+	public boolean isAccurate() {
+		return (firstname != null && surname != null && !firstname.isEmpty() && !surname.isEmpty());
+	}
+
+	@Override
+	public QName getName() {
+		return new QName(QNAME_BASE_URI + "/person", "person");
+	}
+
+	@Override
+	public SequenceType getResultType() {
+		return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
+	}
+
+	@Override
+	public SequenceType[] getArgumentTypes() {
+		// One optional xs:string argument: the raw person name.
+		return new SequenceType[] {
+			SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
+		};
+	}
+
+	@Override
+	public XdmValue call(XdmValue[] arguments) throws SaxonApiException {
+		XdmValue name = arguments[0];
+		if (name.size() == 0) {
+			return name; // empty sequence in, empty sequence out
+		}
+		return new XdmAtomicValue(normalize(name.itemAt(0).getStringValue()));
+	}
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
index 430fbcf95..f803c7cbc 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
+++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -46,6 +46,7 @@ public class XSLTTransformationFunction implements MapFunction { + + @Override + public String apply(String s) { + return org.apache.commons.lang3.text.WordUtils.capitalize(s.toLowerCase()); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java new file mode 100644 index 000000000..01174bf04 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java @@ -0,0 +1,12 @@ + +package eu.dnetlib.dhp.transformation.xslt.utils; + +import com.google.common.base.Function; + +public class DotAbbreviations implements Function { + + @Override + public String apply(String s) { + return s.length() == 1 ? s + "." : s; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index f3a0685ac..a46245f6d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -92,15 +92,19 @@ public class TransformationJobTest extends AbstractVocabularyTest { } @Test - @DisplayName("Test Transform Inst.&Them.v4 record XML with xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid") - public void testTransformITGv4() throws Exception { + @DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite/oaiOpenAIRE") + public void testTransformMostlyUsedScript() throws Exception { + + String xslTransformationScript = ""; + xslTransformationScript = 
"/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl"; + xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"; + // We Set the input Record getting the XML from the classpath final MetadataRecord mr = new MetadataRecord(); mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); // We Load the XSLT transformation Rule from the classpath - XSLTTransformationFunction tr = loadTransformationRule( - "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"); + XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript); MetadataRecord result = tr.call(mr); @@ -110,15 +114,18 @@ public class TransformationJobTest extends AbstractVocabularyTest { } @Test - @DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite") - public void testTransformMostlyUsedScript() throws Exception { + @DisplayName("Test Transform record XML with xslt_cleaning_REST_OmicsDI") + public void testTransformRestScript() throws Exception { + + String xslTransformationScript = ""; + xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl"; + // We Set the input Record getting the XML from the classpath final MetadataRecord mr = new MetadataRecord(); - mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_omicsdi.xml"))); // We Load the XSLT transformation Rule from the classpath - XSLTTransformationFunction tr = loadTransformationRule( - "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl"); + XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript); MetadataRecord result = tr.call(mr); diff --git 
a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml new file mode 100644 index 000000000..b068b89e3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml @@ -0,0 +1,60 @@ + + + + _____OmicsDI::0000337c02d1b51030675d69407655da + PRJNA78295 + 2020-10-31T15:31:30.725Z + _____OmicsDI + + + + 0.235294117647059 + 0 + null + 0.0 + 0 + Sedimentitalea nanhaiensis DSM 24252 Genome sequencing and assembly + 8.20101314054644E-5 + omics_ena_project + Sedimentitalea nanhaiensis DSM 24252 + 14 + 0 + null + Genomics + 0.0 + + 571166 + Sedimentitalea nanhaiensis DSM 24252 + + 0.0 + false + PRJNA78295 + null + 13 + + + + + + https%3A%2F%2Fwww.omicsdi.org%2Fws%2Fdataset%2Fsearch + + + + + + + false + false + 0.9 + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl new file mode 100644 index 000000000..4ac24183f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl @@ -0,0 +1,297 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + record is not compliant, transformation is interrupted. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + UNKNOWN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +