refactoring of common utilities

This commit is contained in:
Claudio Atzori 2024-05-02 11:16:58 +02:00
parent dcf23b3d06
commit 66680b8b9a
9 changed files with 160 additions and 86 deletions

View File

@ -63,11 +63,13 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>edu.cmu</groupId>
<artifactId>dhp-pace-core</artifactId> <artifactId>secondstring</artifactId>
<version>${project.version}</version> </dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId> <artifactId>hadoop-common</artifactId>

View File

@ -0,0 +1,100 @@
package eu.dnetlib.pace.common;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Set of common functions for the framework
*
* @author claudio
*/
public class PaceCommonUtils {
// transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
});
return sb.toString();
}
protected static String transliterate(final String s) {
try {
return transliterator.transliterate(s);
} catch (Exception e) {
return s;
}
}
public static String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
// strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
public static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
public static String unicodeNormalization(final String s) {
Matcher m = hexUnicodePattern.matcher(s);
StringBuffer buf = new StringBuffer(s.length());
while (m.find()) {
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
}
m.appendTail(buf);
return buf.toString();
}
public static Set<String> loadFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils
.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
protected static Iterable<String> tokens(final String s, final int maxTokens) {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
}
}

View File

@ -1,21 +1,20 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.List;
import java.util.Set;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
import eu.dnetlib.pace.common.PaceCommonUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.util.Capitalise; import eu.dnetlib.pace.util.Capitalise;
import eu.dnetlib.pace.util.DotAbbreviations; import eu.dnetlib.pace.util.DotAbbreviations;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.List;
import java.util.Set;
public class Person { public class Person {
private static final String UTF8 = "UTF-8"; private static final String UTF8 = "UTF-8";
@ -86,7 +85,7 @@ public class Person {
private List<String> splitTerms(final String s) { private List<String> splitTerms(final String s) {
if (particles == null) { if (particles == null) {
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt"); particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
} }
final List<String> list = Lists.newArrayList(); final List<String> list = Lists.newArrayList();

View File

@ -0,0 +1,17 @@
package eu.dnetlib.pace.util;
import com.google.common.base.Function;
import org.apache.commons.lang3.text.WordUtils;
public class Capitalise implements Function<String, String> {
private final char[] DELIM = {
' ', '-'
};
@Override
public String apply(final String s) {
return WordUtils.capitalize(s.toLowerCase(), DELIM);
}
}

View File

@ -0,0 +1,11 @@
package eu.dnetlib.pace.util;
import com.google.common.base.Function;
public class DotAbbreviations implements Function<String, String> {
@Override
public String apply(String s) {
return s.length() == 1 ? s + "." : s;
}
}

View File

@ -49,6 +49,12 @@
</build> </build>
<dependencies> <dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>edu.cmu</groupId> <groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId> <artifactId>secondstring</artifactId>

View File

@ -1,32 +1,26 @@
package eu.dnetlib.pace.common; package eu.dnetlib.pace.common;
import com.google.common.base.Joiner;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
/** /**
* Set of common functions for the framework * Set of common functions for the framework
* *
* @author claudio * @author claudio
*/ */
public class AbstractPaceFunctions { public class AbstractPaceFunctions extends PaceCommonUtils {
// city map to be used when translating the city names into codes // city map to be used when translating the city names into codes
private static Map<String, String> cityMap = AbstractPaceFunctions private static Map<String, String> cityMap = AbstractPaceFunctions
@ -41,9 +35,6 @@ public class AbstractPaceFunctions {
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
// transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
// blacklist of ngrams: to avoid generic keys // blacklist of ngrams: to avoid generic keys
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
@ -51,8 +42,6 @@ public class AbstractPaceFunctions {
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>"); public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
// doi prefix for normalization // doi prefix for normalization
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"); public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
@ -129,25 +118,6 @@ public class AbstractPaceFunctions {
return numberPattern.matcher(strNum).matches(); return numberPattern.matcher(strNum).matches();
} }
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
});
return sb.toString();
}
protected static String transliterate(final String s) {
try {
return transliterator.transliterate(s);
} catch (Exception e) {
return s;
}
}
protected static String removeSymbols(final String s) { protected static String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
@ -162,23 +132,6 @@ public class AbstractPaceFunctions {
return s != null; return s != null;
} }
public static String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
// strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
public static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
public static String utf8(final String s) { public static String utf8(final String s) {
byte[] bytes = s.getBytes(StandardCharsets.UTF_8); byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
return new String(bytes, StandardCharsets.UTF_8); return new String(bytes, StandardCharsets.UTF_8);
@ -233,22 +186,6 @@ public class AbstractPaceFunctions {
return newset; return newset;
} }
public static Set<String> loadFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils
.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
public static Map<String, String> loadMapFromClasspath(final String classpath) { public static Map<String, String> loadMapFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng"); Transliterator transliterator = Transliterator.getInstance("Any-Eng");
@ -303,10 +240,6 @@ public class AbstractPaceFunctions {
return StringUtils.substring(s, 0, 1).toLowerCase(); return StringUtils.substring(s, 0, 1).toLowerCase();
} }
protected static Iterable<String> tokens(final String s, final int maxTokens) {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
}
public static String normalizePid(String pid) { public static String normalizePid(String pid) {
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll(""); return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
} }

View File

@ -90,6 +90,12 @@
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>