forked from D-Net/dnet-hadoop
refactoring of common utilities
This commit is contained in:
parent
dcf23b3d06
commit
66680b8b9a
|
@ -63,11 +63,13 @@
|
|||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-pace-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ibm.icu</groupId>
|
||||
<artifactId>icu4j</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
|
||||
package eu.dnetlib.pace.common;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public class PaceCommonUtils {
|
||||
|
||||
// transliterator
|
||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
s.chars().forEach(ch -> {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
||||
});
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String transliterate(final String s) {
|
||||
try {
|
||||
return transliterator.transliterate(s);
|
||||
} catch (Exception e) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
public static String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||
// strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
public static String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public static String unicodeNormalization(final String s) {
|
||||
|
||||
Matcher m = hexUnicodePattern.matcher(s);
|
||||
StringBuffer buf = new StringBuffer(s.length());
|
||||
while (m.find()) {
|
||||
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||
}
|
||||
m.appendTail(buf);
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils
|
||||
.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,21 +1,20 @@
|
|||
|
||||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.common.PaceCommonUtils;
|
||||
import eu.dnetlib.pace.util.Capitalise;
|
||||
import eu.dnetlib.pace.util.DotAbbreviations;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public class Person {
|
||||
|
||||
private static final String UTF8 = "UTF-8";
|
||||
|
@ -86,7 +85,7 @@ public class Person {
|
|||
|
||||
private List<String> splitTerms(final String s) {
|
||||
if (particles == null) {
|
||||
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||
particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||
}
|
||||
|
||||
final List<String> list = Lists.newArrayList();
|
|
@ -0,0 +1,17 @@
|
|||
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
public class Capitalise implements Function<String, String> {
|
||||
|
||||
private final char[] DELIM = {
|
||||
' ', '-'
|
||||
};
|
||||
|
||||
@Override
|
||||
public String apply(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
|
||||
public class DotAbbreviations implements Function<String, String> {
|
||||
@Override
|
||||
public String apply(String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
}
|
|
@ -49,6 +49,12 @@
|
|||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
|
|
|
@ -1,32 +1,26 @@
|
|||
|
||||
package eu.dnetlib.pace.common;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
|
||||
/**
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public class AbstractPaceFunctions {
|
||||
public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||
|
||||
// city map to be used when translating the city names into codes
|
||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||
|
@ -41,9 +35,6 @@ public class AbstractPaceFunctions {
|
|||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||
|
||||
// transliterator
|
||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
// blacklist of ngrams: to avoid generic keys
|
||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||
|
||||
|
@ -51,8 +42,6 @@ public class AbstractPaceFunctions {
|
|||
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
||||
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
// doi prefix for normalization
|
||||
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
||||
|
@ -129,25 +118,6 @@ public class AbstractPaceFunctions {
|
|||
return numberPattern.matcher(strNum).matches();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
s.chars().forEach(ch -> {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
||||
});
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String transliterate(final String s) {
|
||||
try {
|
||||
return transliterator.transliterate(s);
|
||||
} catch (Exception e) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
protected static String removeSymbols(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
|
@ -162,23 +132,6 @@ public class AbstractPaceFunctions {
|
|||
return s != null;
|
||||
}
|
||||
|
||||
public static String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||
// strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
public static String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public static String utf8(final String s) {
|
||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
|
@ -233,22 +186,6 @@ public class AbstractPaceFunctions {
|
|||
return newset;
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils
|
||||
.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
@ -303,10 +240,6 @@ public class AbstractPaceFunctions {
|
|||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
}
|
||||
|
||||
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
|
||||
public static String normalizePid(String pid) {
|
||||
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
||||
}
|
||||
|
|
|
@ -90,6 +90,12 @@
|
|||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-pace-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
|
|
Loading…
Reference in New Issue