From 34df35926cebae09f303538702b607097aa6ccb5 Mon Sep 17 00:00:00 2001 From: Andreas Czerniak Date: Fri, 9 Apr 2021 14:35:30 +0200 Subject: [PATCH 1/4] add xslt, personname cleaner --- .../transformation/xslt/PersonCleaner.java | 206 ++++++++++++ .../xslt/XSLTTransformationFunction.java | 1 + .../transformation/xslt/utils/Capitalize.java | 14 + .../xslt/utils/DotAbbreviations.java | 12 + .../transformation/TransformationJobTest.java | 25 +- .../dnetlib/dhp/transform/input_omicsdi.xml | 60 ++++ .../scripts/xslt_cleaning_REST_OmicsDI.xsl | 297 ++++++++++++++++++ 7 files changed, 606 insertions(+), 9 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java new file mode 100644 index 000000000..069060722 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java @@ -0,0 +1,206 @@ + +package eu.dnetlib.dhp.transformation.xslt; + +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; + +import java.io.Serializable; +// import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.text.Normalizer; +import java.util.List; +import java.util.Set; + +import com.google.common.base.Joiner; +import 
com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.hash.Hashing; + +import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize; +import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations; +import net.sf.saxon.s9api.ExtensionFunction; +import net.sf.saxon.s9api.ItemType; +import net.sf.saxon.s9api.OccurrenceIndicator; +import net.sf.saxon.s9api.QName; +import net.sf.saxon.s9api.SaxonApiException; +import net.sf.saxon.s9api.SequenceType; +import net.sf.saxon.s9api.XdmValue; + +//import eu.dnetlib.pace.clustering.NGramUtils; +//import eu.dnetlib.pace.util.Capitalise; +//import eu.dnetlib.pace.util.DotAbbreviations; + +public class PersonCleaner implements ExtensionFunction, Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; + private List firstname = Lists.newArrayList(); + private List surname = Lists.newArrayList(); + private List fullname = Lists.newArrayList(); + + private static Set particles = null; + + public PersonCleaner() { + + } + + public String normalize(String s) { + s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + +// s = s.replaceAll("[\\W&&[^,-]]", " "); + +// System.out.println("class Person: s: " + s); + +// s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", " "); + s = s.replaceAll("[\\p{Punct}&&[^-,]]", " "); + s = s.replace("\\d", " "); + s = s.replace("\\n", " "); + s = s.replace("\\.", " "); + s = s.replaceAll("\\s+", " "); + + if (s.contains(",")) { + // System.out.println("class Person: s: " + s); + + String[] arr = s.split(","); + if (arr.length == 1) { + + fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + firstname = splitTermsFirstName(arr[1]); +// System.out.println("class Person: surname: " + 
surname); +// System.out.println("class Person: firstname: " + firstname); + + fullname.addAll(surname); + fullname.addAll(firstname); + } + } else { + fullname = splitTerms(s); + + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; + + for (int i = 0; i < fullname.size(); i++) { + String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } + if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini + firstname = fullname.subList(0, lastInitialPosition + 1); + System.out.println("name: " + firstname); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (String term : fullname) { + if (term.length() > 1 && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + firstname.add(term); + } + } + } else if (lastInitialPosition == fullname.size()) { + surname = fullname.subList(lastInitialPosition - 1, fullname.size()); + firstname = fullname.subList(0, lastInitialPosition - 1); + } + + } + return null; + } + + private List splitTermsFirstName(String s) { + List list = Lists.newArrayList(); + for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (s.trim().matches("\\p{Lu}{2,3}")) { + String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase) + for (String p : parts) { + if (p.length() > 0) + list.add(p); + } + } else { + list.add(part); + } + + } + return list; + } + + private List splitTerms(String s) { + if (particles == null) { + // particles = NGramUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt"); + } + + List list = Lists.newArrayList(); + for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + // if (!particles.contains(part.toLowerCase())) { + list.add(part); + + // } + } + return list; + } + + public List getFirstname() { + return 
firstname; + } + + public List getSurname() { + return surname; + } + + public List getFullname() { + return fullname; + } + + public String hash() { + return Hashing.murmur3_128().hashString(getNormalisedFullname(), StandardCharsets.UTF_8).toString(); + } + + public String getNormalisedFullname() { + return isAccurate() ? Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) + : Joiner.on(" ").join(fullname); + // return isAccurate() ? + // Joiner.on(" ").join(getCapitalSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) : + // Joiner.on(" ").join(fullname); + } + + public List getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, new Capitalize())); + } + + public List getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(firstname, new DotAbbreviations())); + } + + public boolean isAccurate() { + return (firstname != null && surname != null && !firstname.isEmpty() && !surname.isEmpty()); + } + + @Override + public QName getName() { + return new QName(QNAME_BASE_URI + "/person", "person"); + } + + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE); + } + + @Override + public SequenceType[] getArgumentTypes() { + // TODO Auto-generated method stub + return null; + } + + @Override + public XdmValue call(XdmValue[] arguments) throws SaxonApiException { + // TODO Auto-generated method stub + return null; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index 430fbcf95..f803c7cbc 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -46,6 +46,7 @@ public class XSLTTransformationFunction implements MapFunction { + + @Override + public String apply(String s) { + return org.apache.commons.lang3.text.WordUtils.capitalize(s.toLowerCase()); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java new file mode 100644 index 000000000..01174bf04 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java @@ -0,0 +1,12 @@ + +package eu.dnetlib.dhp.transformation.xslt.utils; + +import com.google.common.base.Function; + +public class DotAbbreviations implements Function { + + @Override + public String apply(String s) { + return s.length() == 1 ? s + "." : s; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index f3a0685ac..a46245f6d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -92,15 +92,19 @@ public class TransformationJobTest extends AbstractVocabularyTest { } @Test - @DisplayName("Test Transform Inst.&Them.v4 record XML with xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid") - public void testTransformITGv4() throws Exception { + @DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite/oaiOpenAIRE") + public void testTransformMostlyUsedScript() throws Exception { + + String xslTransformationScript = ""; + xslTransformationScript = 
"/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl"; + xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"; + // We Set the input Record getting the XML from the classpath final MetadataRecord mr = new MetadataRecord(); mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); // We Load the XSLT transformation Rule from the classpath - XSLTTransformationFunction tr = loadTransformationRule( - "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"); + XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript); MetadataRecord result = tr.call(mr); @@ -110,15 +114,18 @@ public class TransformationJobTest extends AbstractVocabularyTest { } @Test - @DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite") - public void testTransformMostlyUsedScript() throws Exception { + @DisplayName("Test Transform record XML with xslt_cleaning_REST_OmicsDI") + public void testTransformRestScript() throws Exception { + + String xslTransformationScript = ""; + xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl"; + // We Set the input Record getting the XML from the classpath final MetadataRecord mr = new MetadataRecord(); - mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_omicsdi.xml"))); // We Load the XSLT transformation Rule from the classpath - XSLTTransformationFunction tr = loadTransformationRule( - "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl"); + XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript); MetadataRecord result = tr.call(mr); diff --git 
a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml new file mode 100644 index 000000000..b068b89e3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml @@ -0,0 +1,60 @@ + + + + _____OmicsDI::0000337c02d1b51030675d69407655da + PRJNA78295 + 2020-10-31T15:31:30.725Z + _____OmicsDI + + + + 0.235294117647059 + 0 + null + 0.0 + 0 + Sedimentitalea nanhaiensis DSM 24252 Genome sequencing and assembly + 8.20101314054644E-5 + omics_ena_project + Sedimentitalea nanhaiensis DSM 24252 + 14 + 0 + null + Genomics + 0.0 + + 571166 + Sedimentitalea nanhaiensis DSM 24252 + + 0.0 + false + PRJNA78295 + null + 13 + + + + + + https%3A%2F%2Fwww.omicsdi.org%2Fws%2Fdataset%2Fsearch + + + + + + + false + false + 0.9 + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl new file mode 100644 index 000000000..4ac24183f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl @@ -0,0 +1,297 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + record is not compliant, transformation is interrupted. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + UNKNOWN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 3b694074ffab57fda01d9fc58b6d5bde9a847132 Mon Sep 17 00:00:00 2001 From: Andreas Czerniak Date: Fri, 9 Apr 2021 14:35:30 +0200 Subject: [PATCH 2/4] add xslt, personname cleaner --- .../transformation/xslt/PersonCleaner.java | 206 ++++++++++++ .../xslt/XSLTTransformationFunction.java | 1 + .../transformation/xslt/utils/Capitalize.java | 14 + .../xslt/utils/DotAbbreviations.java | 12 + .../transformation/TransformationJobTest.java | 25 +- .../dnetlib/dhp/transform/input_omicsdi.xml | 60 ++++ .../scripts/xslt_cleaning_REST_OmicsDI.xsl | 297 ++++++++++++++++++ 7 files changed, 606 insertions(+), 9 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java new file mode 100644 index 000000000..069060722 --- /dev/null +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java @@ -0,0 +1,206 @@ + +package eu.dnetlib.dhp.transformation.xslt; + +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; + +import java.io.Serializable; +// import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.text.Normalizer; +import java.util.List; +import java.util.Set; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.hash.Hashing; + +import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize; +import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations; +import net.sf.saxon.s9api.ExtensionFunction; +import net.sf.saxon.s9api.ItemType; +import net.sf.saxon.s9api.OccurrenceIndicator; +import net.sf.saxon.s9api.QName; +import net.sf.saxon.s9api.SaxonApiException; +import net.sf.saxon.s9api.SequenceType; +import net.sf.saxon.s9api.XdmValue; + +//import eu.dnetlib.pace.clustering.NGramUtils; +//import eu.dnetlib.pace.util.Capitalise; +//import eu.dnetlib.pace.util.DotAbbreviations; + +public class PersonCleaner implements ExtensionFunction, Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; + private List firstname = Lists.newArrayList(); + private List surname = Lists.newArrayList(); + private List fullname = Lists.newArrayList(); + + private static Set particles = null; + + public PersonCleaner() { + + } + + public String normalize(String s) { + s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + +// s = s.replaceAll("[\\W&&[^,-]]", " "); + +// System.out.println("class Person: s: " + s); + +// s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", " "); 
+ s = s.replaceAll("[\\p{Punct}&&[^-,]]", " "); + s = s.replace("\\d", " "); + s = s.replace("\\n", " "); + s = s.replace("\\.", " "); + s = s.replaceAll("\\s+", " "); + + if (s.contains(",")) { + // System.out.println("class Person: s: " + s); + + String[] arr = s.split(","); + if (arr.length == 1) { + + fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + firstname = splitTermsFirstName(arr[1]); +// System.out.println("class Person: surname: " + surname); +// System.out.println("class Person: firstname: " + firstname); + + fullname.addAll(surname); + fullname.addAll(firstname); + } + } else { + fullname = splitTerms(s); + + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; + + for (int i = 0; i < fullname.size(); i++) { + String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } + if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. 
Artini + firstname = fullname.subList(0, lastInitialPosition + 1); + System.out.println("name: " + firstname); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (String term : fullname) { + if (term.length() > 1 && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + firstname.add(term); + } + } + } else if (lastInitialPosition == fullname.size()) { + surname = fullname.subList(lastInitialPosition - 1, fullname.size()); + firstname = fullname.subList(0, lastInitialPosition - 1); + } + + } + return null; + } + + private List splitTermsFirstName(String s) { + List list = Lists.newArrayList(); + for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (s.trim().matches("\\p{Lu}{2,3}")) { + String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase) + for (String p : parts) { + if (p.length() > 0) + list.add(p); + } + } else { + list.add(part); + } + + } + return list; + } + + private List splitTerms(String s) { + if (particles == null) { + // particles = NGramUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt"); + } + + List list = Lists.newArrayList(); + for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + // if (!particles.contains(part.toLowerCase())) { + list.add(part); + + // } + } + return list; + } + + public List getFirstname() { + return firstname; + } + + public List getSurname() { + return surname; + } + + public List getFullname() { + return fullname; + } + + public String hash() { + return Hashing.murmur3_128().hashString(getNormalisedFullname(), StandardCharsets.UTF_8).toString(); + } + + public String getNormalisedFullname() { + return isAccurate() ? Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) + : Joiner.on(" ").join(fullname); + // return isAccurate() ? 
+ // Joiner.on(" ").join(getCapitalSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) : + // Joiner.on(" ").join(fullname); + } + + public List getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, new Capitalize())); + } + + public List getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(firstname, new DotAbbreviations())); + } + + public boolean isAccurate() { + return (firstname != null && surname != null && !firstname.isEmpty() && !surname.isEmpty()); + } + + @Override + public QName getName() { + return new QName(QNAME_BASE_URI + "/person", "person"); + } + + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE); + } + + @Override + public SequenceType[] getArgumentTypes() { + // TODO Auto-generated method stub + return null; + } + + @Override + public XdmValue call(XdmValue[] arguments) throws SaxonApiException { + // TODO Auto-generated method stub + return null; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index 430fbcf95..f803c7cbc 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -46,6 +46,7 @@ public class XSLTTransformationFunction implements MapFunction { + + @Override + public String apply(String s) { + return org.apache.commons.lang3.text.WordUtils.capitalize(s.toLowerCase()); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java new 
file mode 100644 index 000000000..01174bf04 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java @@ -0,0 +1,12 @@ + +package eu.dnetlib.dhp.transformation.xslt.utils; + +import com.google.common.base.Function; + +public class DotAbbreviations implements Function { + + @Override + public String apply(String s) { + return s.length() == 1 ? s + "." : s; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index f3a0685ac..a46245f6d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -92,15 +92,19 @@ public class TransformationJobTest extends AbstractVocabularyTest { } @Test - @DisplayName("Test Transform Inst.&Them.v4 record XML with xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid") - public void testTransformITGv4() throws Exception { + @DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite/oaiOpenAIRE") + public void testTransformMostlyUsedScript() throws Exception { + + String xslTransformationScript = ""; + xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl"; + xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"; + // We Set the input Record getting the XML from the classpath final MetadataRecord mr = new MetadataRecord(); mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); // We Load the XSLT transformation Rule from the classpath - XSLTTransformationFunction tr = loadTransformationRule( - 
"/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"); + XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript); MetadataRecord result = tr.call(mr); @@ -110,15 +114,18 @@ public class TransformationJobTest extends AbstractVocabularyTest { } @Test - @DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite") - public void testTransformMostlyUsedScript() throws Exception { + @DisplayName("Test Transform record XML with xslt_cleaning_REST_OmicsDI") + public void testTransformRestScript() throws Exception { + + String xslTransformationScript = ""; + xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl"; + // We Set the input Record getting the XML from the classpath final MetadataRecord mr = new MetadataRecord(); - mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_omicsdi.xml"))); // We Load the XSLT transformation Rule from the classpath - XSLTTransformationFunction tr = loadTransformationRule( - "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl"); + XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript); MetadataRecord result = tr.call(mr); diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml new file mode 100644 index 000000000..b068b89e3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml @@ -0,0 +1,60 @@ + + + + _____OmicsDI::0000337c02d1b51030675d69407655da + PRJNA78295 + 2020-10-31T15:31:30.725Z + _____OmicsDI + + + + 0.235294117647059 + 0 + null + 0.0 + 0 + Sedimentitalea nanhaiensis DSM 24252 Genome sequencing and assembly + 
8.20101314054644E-5 + omics_ena_project + Sedimentitalea nanhaiensis DSM 24252 + 14 + 0 + null + Genomics + 0.0 + + 571166 + Sedimentitalea nanhaiensis DSM 24252 + + 0.0 + false + PRJNA78295 + null + 13 + + + + + + https%3A%2F%2Fwww.omicsdi.org%2Fws%2Fdataset%2Fsearch + + + + + + + false + false + 0.9 + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl new file mode 100644 index 000000000..4ac24183f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl @@ -0,0 +1,297 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + record is not compliant, transformation is interrupted. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + UNKNOWN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From d7614c1f85418c69331687045004daa3b54713b4 Mon Sep 17 00:00:00 2001 From: Andreas Czerniak Date: Tue, 13 Apr 2021 07:04:20 +0200 Subject: [PATCH 3/4] introduce new const --- dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java index eb4cb91ed..108edad47 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java @@ -42,6 +42,8 @@ public class Constants { public static final String 
RETRY_DELAY = "retryDelay"; public static final String CONNECT_TIMEOUT = "connectTimeOut"; public static final String READ_TIMEOUT = "readTimeOut"; + public static final String FROM_DATE_OVERRIDE = "fromDateOverride"; + public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride"; public static final String CONTENT_TOTALITEMS = "TotalItems"; public static final String CONTENT_INVALIDRECORDS = "InvalidRecords"; From d1ca025b0bdbdff02901b4dd6514dce397d16ee3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 13 Apr 2021 14:32:41 +0200 Subject: [PATCH 4/4] [cleaning] removing authors without fullname or providing 'deactivated' keyword. Removing test test titles --- .../dhp/schema/oaf/CleaningFunctions.java | 79 ++++++++++++++++--- .../oa/graph/clean/CleanGraphSparkJob.java | 1 + .../oa/graph/clean/CleaningFunctionTest.java | 4 + .../eu/dnetlib/dhp/oa/graph/clean/result.json | 22 ++++++ 4 files changed, 96 insertions(+), 10 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index 917733a14..673bee314 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf; import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; @@ -22,6 +23,9 @@ public class CleaningFunctions { public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final Set PID_BLACKLIST = new HashSet<>(); + public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; + public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]"; + public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10; static { PID_BLACKLIST.add("none"); @@ -80,6 +84,36 @@ public class CleaningFunctions { 
return value; } + public static boolean filter(T value) { + if (value instanceof Datasource) { + // nothing to evaluate here + } else if (value instanceof Project) { + // nothing to evaluate here + } else if (value instanceof Organization) { + // nothing to evaluate here + } else if (value instanceof Relation) { + // nothing to clean here + } else if (value instanceof Result) { + + Result r = (Result) value; + + if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) { + return false; + } + + if (value instanceof Publication) { + + } else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + return true; + } + public static T cleanup(T value) { if (value instanceof Datasource) { // nothing to clean here @@ -124,6 +158,12 @@ public class CleaningFunctions { .stream() .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter( + sp -> sp + .getValue() + .toLowerCase() + .replaceAll(TITLE_FILTER_REGEX, "") + .length() > TITLE_FILTER_RESIDUAL_LENGTH) .map(CleaningFunctions::cleanValue) .collect(Collectors.toList())); } @@ -199,16 +239,7 @@ public class CleaningFunctions { } } if (Objects.nonNull(r.getAuthor())) { - boolean nullRank = r - .getAuthor() - .stream() - .anyMatch(a -> Objects.isNull(a.getRank())); - if (nullRank) { - int i = 1; - for (Author author : r.getAuthor()) { - author.setRank(i++); - } - } + final List authors = Lists.newArrayList(); for (Author a : r.getAuthor()) { if (Objects.isNull(a.getPid())) { a.setPid(Lists.newArrayList()); @@ -235,8 +266,27 @@ public class CleaningFunctions { .stream() .collect(Collectors.toList())); } + if (StringUtils.isBlank(a.getFullname())) { + if (StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname())) { + a.setFullname(a.getSurname() + ", " + a.getName()); + } + } + if (StringUtils.isNotBlank(a.getFullname()) && isValidAuthorName(a)) { + 
authors.add(a); + } } + boolean nullRank = authors + .stream() + .anyMatch(a -> Objects.isNull(a.getRank())); + if (nullRank) { + int i = 1; + for (Author author : authors) { + author.setRank(i++); + } + } + r.setAuthor(authors); + } if (value instanceof Publication) { @@ -252,6 +302,15 @@ public class CleaningFunctions { return value; } + private static boolean isValidAuthorName(Author a) { + return !Stream + .of(a.getFullname(), a.getName(), a.getSurname()) + .filter(s -> s != null && !s.isEmpty()) + .collect(Collectors.joining("")) + .toLowerCase() + .matches(INVALID_AUTHOR_REGEX); + } + private static List processPidCleaning(List pids) { return pids .stream() diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 86c453656..088539325 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -90,6 +90,7 @@ public class CleanGraphSparkJob { .map((MapFunction) value -> fixVocabularyNames(value), Encoders.bean(clazz)) .map((MapFunction) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) .map((MapFunction) value -> cleanup(value), Encoders.bean(clazz)) + .filter((FilterFunction) value -> filter(value)) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index fdbc58c17..15cb054ad 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -67,6 
+67,7 @@ public class CleaningFunctionTest { assertNotNull(p_out.getPublisher()); assertNull(p_out.getPublisher().getValue()); + assertEquals("und", p_out.getLanguage().getClassid()); assertEquals("Undetermined", p_out.getLanguage().getClassname()); @@ -120,6 +121,9 @@ public class CleaningFunctionTest { .isPresent()); Publication p_cleaned = CleaningFunctions.cleanup(p_out); + + assertEquals(1, p_cleaned.getTitle().size()); + assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid()); assertNull(p_out.getPublisher()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 23de2ef86..8670c10f1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -865,6 +865,28 @@ "schemename": "dnet:dataCite_title" }, "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "test test 123 test" } ] } \ No newline at end of file