forked from D-Net/dnet-hadoop
refactoring of common utilities
This commit is contained in:
parent
dcf23b3d06
commit
66680b8b9a
|
@ -63,11 +63,13 @@
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>edu.cmu</groupId>
|
||||||
<artifactId>dhp-pace-core</artifactId>
|
<artifactId>secondstring</artifactId>
|
||||||
<version>${project.version}</version>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.ibm.icu</groupId>
|
||||||
|
<artifactId>icu4j</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.hadoop</groupId>
|
<groupId>org.apache.hadoop</groupId>
|
||||||
<artifactId>hadoop-common</artifactId>
|
<artifactId>hadoop-common</artifactId>
|
||||||
|
|
|
@ -0,0 +1,100 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.common;
|
||||||
|
|
||||||
|
import com.google.common.base.Splitter;
|
||||||
|
import com.google.common.collect.Iterables;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set of common functions for the framework
|
||||||
|
*
|
||||||
|
* @author claudio
|
||||||
|
*/
|
||||||
|
public class PaceCommonUtils {
|
||||||
|
|
||||||
|
// transliterator
|
||||||
|
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
|
||||||
|
protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||||
|
protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||||
|
|
||||||
|
protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||||
|
|
||||||
|
protected static String fixAliases(final String s) {
|
||||||
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
|
s.chars().forEach(ch -> {
|
||||||
|
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||||
|
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
||||||
|
});
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static String transliterate(final String s) {
|
||||||
|
try {
|
||||||
|
return transliterator.transliterate(s);
|
||||||
|
} catch (Exception e) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String normalize(final String s) {
|
||||||
|
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||||
|
.toLowerCase()
|
||||||
|
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||||
|
// strings
|
||||||
|
.replaceAll("[^ \\w]+", "")
|
||||||
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||||
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
|
.replaceAll("(\\d)+", " ")
|
||||||
|
.replaceAll("(\\n)+", " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String nfd(final String s) {
|
||||||
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String unicodeNormalization(final String s) {
|
||||||
|
|
||||||
|
Matcher m = hexUnicodePattern.matcher(s);
|
||||||
|
StringBuffer buf = new StringBuffer(s.length());
|
||||||
|
while (m.find()) {
|
||||||
|
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||||
|
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||||
|
}
|
||||||
|
m.appendTail(buf);
|
||||||
|
return buf.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Set<String> loadFromClasspath(final String classpath) {
|
||||||
|
|
||||||
|
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
|
||||||
|
final Set<String> h = Sets.newHashSet();
|
||||||
|
try {
|
||||||
|
for (final String s : IOUtils
|
||||||
|
.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||||
|
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||||
|
}
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
return Sets.newHashSet();
|
||||||
|
}
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||||
|
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,21 +1,20 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.text.Normalizer;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.hash.Hashing;
|
import com.google.common.hash.Hashing;
|
||||||
|
import eu.dnetlib.pace.common.PaceCommonUtils;
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
|
||||||
import eu.dnetlib.pace.util.Capitalise;
|
import eu.dnetlib.pace.util.Capitalise;
|
||||||
import eu.dnetlib.pace.util.DotAbbreviations;
|
import eu.dnetlib.pace.util.DotAbbreviations;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
public class Person {
|
public class Person {
|
||||||
|
|
||||||
private static final String UTF8 = "UTF-8";
|
private static final String UTF8 = "UTF-8";
|
||||||
|
@ -86,7 +85,7 @@ public class Person {
|
||||||
|
|
||||||
private List<String> splitTerms(final String s) {
|
private List<String> splitTerms(final String s) {
|
||||||
if (particles == null) {
|
if (particles == null) {
|
||||||
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<String> list = Lists.newArrayList();
|
final List<String> list = Lists.newArrayList();
|
|
@ -0,0 +1,17 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.util;
|
||||||
|
|
||||||
|
import com.google.common.base.Function;
|
||||||
|
import org.apache.commons.lang3.text.WordUtils;
|
||||||
|
|
||||||
|
public class Capitalise implements Function<String, String> {
|
||||||
|
|
||||||
|
private final char[] DELIM = {
|
||||||
|
' ', '-'
|
||||||
|
};
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String apply(final String s) {
|
||||||
|
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,11 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.util;
|
||||||
|
|
||||||
|
import com.google.common.base.Function;
|
||||||
|
|
||||||
|
public class DotAbbreviations implements Function<String, String> {
|
||||||
|
@Override
|
||||||
|
public String apply(String s) {
|
||||||
|
return s.length() == 1 ? s + "." : s;
|
||||||
|
}
|
||||||
|
}
|
|
@ -49,6 +49,12 @@
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-common</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>edu.cmu</groupId>
|
<groupId>edu.cmu</groupId>
|
||||||
<artifactId>secondstring</artifactId>
|
<artifactId>secondstring</artifactId>
|
||||||
|
|
|
@ -1,32 +1,26 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.common;
|
package eu.dnetlib.pace.common;
|
||||||
|
|
||||||
|
import com.google.common.base.Joiner;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.Normalizer;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import com.google.common.base.Joiner;
|
|
||||||
import com.google.common.base.Splitter;
|
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
import com.ibm.icu.text.Transliterator;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set of common functions for the framework
|
* Set of common functions for the framework
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
public class AbstractPaceFunctions {
|
public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
|
|
||||||
// city map to be used when translating the city names into codes
|
// city map to be used when translating the city names into codes
|
||||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||||
|
@ -41,9 +35,6 @@ public class AbstractPaceFunctions {
|
||||||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||||
|
|
||||||
// transliterator
|
|
||||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
|
||||||
|
|
||||||
// blacklist of ngrams: to avoid generic keys
|
// blacklist of ngrams: to avoid generic keys
|
||||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||||
|
|
||||||
|
@ -51,8 +42,6 @@ public class AbstractPaceFunctions {
|
||||||
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
||||||
|
|
||||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
|
||||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
|
||||||
|
|
||||||
// doi prefix for normalization
|
// doi prefix for normalization
|
||||||
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
||||||
|
@ -129,25 +118,6 @@ public class AbstractPaceFunctions {
|
||||||
return numberPattern.matcher(strNum).matches();
|
return numberPattern.matcher(strNum).matches();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static String fixAliases(final String s) {
|
|
||||||
final StringBuilder sb = new StringBuilder();
|
|
||||||
|
|
||||||
s.chars().forEach(ch -> {
|
|
||||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
|
||||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
|
||||||
});
|
|
||||||
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static String transliterate(final String s) {
|
|
||||||
try {
|
|
||||||
return transliterator.transliterate(s);
|
|
||||||
} catch (Exception e) {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static String removeSymbols(final String s) {
|
protected static String removeSymbols(final String s) {
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
|
@ -162,23 +132,6 @@ public class AbstractPaceFunctions {
|
||||||
return s != null;
|
return s != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String normalize(final String s) {
|
|
||||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
|
||||||
.toLowerCase()
|
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
|
||||||
// strings
|
|
||||||
.replaceAll("[^ \\w]+", "")
|
|
||||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
|
||||||
.replaceAll("(\\d)+", " ")
|
|
||||||
.replaceAll("(\\n)+", " ")
|
|
||||||
.trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String nfd(final String s) {
|
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String utf8(final String s) {
|
public static String utf8(final String s) {
|
||||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||||
return new String(bytes, StandardCharsets.UTF_8);
|
return new String(bytes, StandardCharsets.UTF_8);
|
||||||
|
@ -233,22 +186,6 @@ public class AbstractPaceFunctions {
|
||||||
return newset;
|
return newset;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Set<String> loadFromClasspath(final String classpath) {
|
|
||||||
|
|
||||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
|
||||||
|
|
||||||
final Set<String> h = Sets.newHashSet();
|
|
||||||
try {
|
|
||||||
for (final String s : IOUtils
|
|
||||||
.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
|
||||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
|
||||||
}
|
|
||||||
} catch (final Throwable e) {
|
|
||||||
return Sets.newHashSet();
|
|
||||||
}
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||||
|
|
||||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
@ -303,10 +240,6 @@ public class AbstractPaceFunctions {
|
||||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
|
||||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String normalizePid(String pid) {
|
public static String normalizePid(String pid) {
|
||||||
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
|
@ -90,6 +90,12 @@
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-pace-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
|
|
Loading…
Reference in New Issue