From 66680b8b9a69a2801016ee4a9b34f872ce6a766f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 2 May 2024 11:16:58 +0200 Subject: [PATCH] refactoring of common utilities --- dhp-common/pom.xml | 10 +- .../dnetlib/pace/common/PaceCommonUtils.java | 100 ++++++++++++++++++ .../java/eu/dnetlib/pace/model/Person.java | 15 ++- .../java/eu/dnetlib/pace/util/Capitalise.java | 17 +++ .../dnetlib/pace/util/DotAbbreviations.java | 11 ++ .../eu/dnetlib/pace/config/name_particles.txt | 0 dhp-pace-core/pom.xml | 6 ++ .../pace/common/AbstractPaceFunctions.java | 81 ++------------ dhp-workflows/dhp-graph-mapper/pom.xml | 6 ++ 9 files changed, 160 insertions(+), 86 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java rename {dhp-pace-core => dhp-common}/src/main/java/eu/dnetlib/pace/model/Person.java (96%) create mode 100644 dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java rename {dhp-pace-core => dhp-common}/src/main/resources/eu/dnetlib/pace/config/name_particles.txt (100%) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 692d2bdc3..04735876d 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -63,11 +63,13 @@ - eu.dnetlib.dhp - dhp-pace-core - ${project.version} + edu.cmu + secondstring + + + com.ibm.icu + icu4j - org.apache.hadoop hadoop-common diff --git a/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java b/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java new file mode 100644 index 000000000..a279271b5 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java @@ -0,0 +1,100 @@ + +package eu.dnetlib.pace.common; + +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Transliterator; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; + +import java.nio.charset.StandardCharsets; +import java.text.Normalizer; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Set of common functions for the framework + * + * @author claudio + */ +public class PaceCommonUtils { + + // transliterator + protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng"); + + protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; + protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; + + protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); + + protected static String fixAliases(final String s) { + final StringBuilder sb = new StringBuilder(); + + s.chars().forEach(ch -> { + final int i = StringUtils.indexOf(aliases_from, ch); + sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch); + }); + + return sb.toString(); + } + + protected static String transliterate(final String s) { + try { + return transliterator.transliterate(s); + } catch (Exception e) { + return s; + } + } + + public static String normalize(final String s) { + return fixAliases(transliterate(nfd(unicodeNormalization(s)))) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input + // strings + .replaceAll("[^ \\w]+", "") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + public static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + + public static String unicodeNormalization(final String s) { + + Matcher m = hexUnicodePattern.matcher(s); + StringBuffer buf = new StringBuffer(s.length()); + while (m.find()) { + String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16)); + m.appendReplacement(buf, Matcher.quoteReplacement(ch)); + } + m.appendTail(buf); + return buf.toString(); + } + + public static Set loadFromClasspath(final String classpath) { + + Transliterator transliterator = Transliterator.getInstance("Any-Eng"); + + final Set h = Sets.newHashSet(); + try { + for (final String s : IOUtils + .readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) { + h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords + } + } catch (final Throwable e) { + return Sets.newHashSet(); + } + return h; + } + + protected static Iterable tokens(final String s, final int maxTokens) { + return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens); + } + +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java similarity index 96% rename from dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java rename to dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java index 96120cf4d..c95c9d823 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java +++ b/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java @@ -1,21 +1,20 @@ package eu.dnetlib.pace.model; -import java.nio.charset.Charset; -import java.text.Normalizer; -import java.util.List; -import java.util.Set; - import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.hash.Hashing; - -import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.common.PaceCommonUtils; import eu.dnetlib.pace.util.Capitalise; import eu.dnetlib.pace.util.DotAbbreviations; +import java.nio.charset.Charset; +import java.text.Normalizer; +import java.util.List; +import java.util.Set; + public class Person { private static final String UTF8 = "UTF-8"; @@ -86,7 +85,7 @@ public class Person { private List splitTerms(final String s) { if (particles == null) { - particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt"); + particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt"); } final List list = Lists.newArrayList(); diff --git a/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java b/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java new file mode 100644 index 000000000..015386423 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java @@ -0,0 +1,17 @@ + +package eu.dnetlib.pace.util; + +import com.google.common.base.Function; +import org.apache.commons.lang3.text.WordUtils; + +public class Capitalise implements Function { + + private final char[] DELIM = { + ' ', '-' + }; + + @Override + public String apply(final String s) { + return WordUtils.capitalize(s.toLowerCase(), DELIM); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java b/dhp-common/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java new file mode 100644 index 000000000..2c89da4db --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java @@ -0,0 +1,11 @@ + +package eu.dnetlib.pace.util; + +import com.google.common.base.Function; + +public class DotAbbreviations implements Function { + @Override + public String apply(String s) { + return s.length() == 1 ? s + "." : s; + } +} diff --git a/dhp-pace-core/src/main/resources/eu/dnetlib/pace/config/name_particles.txt b/dhp-common/src/main/resources/eu/dnetlib/pace/config/name_particles.txt similarity index 100% rename from dhp-pace-core/src/main/resources/eu/dnetlib/pace/config/name_particles.txt rename to dhp-common/src/main/resources/eu/dnetlib/pace/config/name_particles.txt diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml index 7b384f109..1593575d2 100644 --- a/dhp-pace-core/pom.xml +++ b/dhp-pace-core/pom.xml @@ -49,6 +49,12 @@ + + eu.dnetlib.dhp + dhp-common + ${project.version} + + edu.cmu secondstring diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index ba7639ada..6bfb8b3f4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -1,32 +1,26 @@ package eu.dnetlib.pace.common; +import com.google.common.base.Joiner; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Transliterator; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; + import java.io.IOException; import java.io.StringWriter; import java.nio.charset.StandardCharsets; -import java.text.Normalizer; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; - -import com.google.common.base.Joiner; -import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import com.ibm.icu.text.Transliterator; - -import eu.dnetlib.pace.clustering.NGramUtils; - /** * Set of common functions for the framework * * @author claudio */ -public class AbstractPaceFunctions { +public class AbstractPaceFunctions extends PaceCommonUtils { // city map to be used when translating the city names into codes private static Map cityMap = AbstractPaceFunctions @@ -41,9 +35,6 @@ public class AbstractPaceFunctions { protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); - // transliterator - protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng"); - // blacklist of ngrams: to avoid generic keys protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); @@ -51,8 +42,6 @@ public class AbstractPaceFunctions { public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>"); private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; - private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; - private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; // doi prefix for normalization public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"); @@ -129,25 +118,6 @@ public class AbstractPaceFunctions { return numberPattern.matcher(strNum).matches(); } - protected static String fixAliases(final String s) { - final StringBuilder sb = new StringBuilder(); - - s.chars().forEach(ch -> { - final int i = StringUtils.indexOf(aliases_from, ch); - sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch); - }); - - return sb.toString(); - } - - protected static String transliterate(final String s) { - try { - return transliterator.transliterate(s); - } catch (Exception e) { - return s; - } - } - protected static String removeSymbols(final String s) { final StringBuilder sb = new StringBuilder(); @@ -162,23 +132,6 @@ public class AbstractPaceFunctions { return s != null; } - public static String normalize(final String s) { - return fixAliases(transliterate(nfd(unicodeNormalization(s)))) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input - // strings - .replaceAll("[^ \\w]+", "") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } - - public static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } - public static String utf8(final String s) { byte[] bytes = s.getBytes(StandardCharsets.UTF_8); return new String(bytes, StandardCharsets.UTF_8); @@ -233,22 +186,6 @@ public class AbstractPaceFunctions { return newset; } - public static Set loadFromClasspath(final String classpath) { - - Transliterator transliterator = Transliterator.getInstance("Any-Eng"); - - final Set h = Sets.newHashSet(); - try { - for (final String s : IOUtils - .readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) { - h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords - } - } catch (final Throwable e) { - return Sets.newHashSet(); - } - return h; - } - public static Map loadMapFromClasspath(final String classpath) { Transliterator transliterator = Transliterator.getInstance("Any-Eng"); @@ -303,10 +240,6 @@ public class AbstractPaceFunctions { return StringUtils.substring(s, 0, 1).toLowerCase(); } - protected static Iterable tokens(final String s, final int maxTokens) { - return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens); - } - public static String normalizePid(String pid) { return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll(""); } diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index c7ac55ef6..2c93bab83 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -90,6 +90,12 @@ ${project.version} + + eu.dnetlib.dhp + dhp-pace-core + ${project.version} + + com.jayway.jsonpath json-path