forked from D-Net/dnet-hadoop
upgraded maven version of commons-lang
This commit is contained in:
parent
5c8f6febee
commit
46727f5c76
|
@ -27,17 +27,14 @@
|
|||
<artifactId>gson</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-lang</groupId>
|
||||
<artifactId>commons-lang</artifactId>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-collections</groupId>
|
||||
<artifactId>commons-collections</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.pace.clustering;
|
|||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
|
|
|
@ -6,7 +6,7 @@ import java.util.Map;
|
|||
import com.google.common.base.Predicate;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.pace.clustering;
|
|||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
|
|
@ -8,7 +8,7 @@ import com.google.common.collect.Lists;
|
|||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
@ClusteringClass("lowercase")
|
||||
public class LowercaseClustering extends AbstractClusteringFunction {
|
||||
|
|
|
@ -2,7 +2,7 @@ package eu.dnetlib.pace.clustering;
|
|||
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
|||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
|
|
@ -5,8 +5,8 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang.RandomStringUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.RandomStringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
|
|
|
@ -9,9 +9,8 @@ import eu.dnetlib.pace.clustering.NGramUtils;
|
|||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.FieldListImpl;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
|
@ -25,295 +24,293 @@ import java.util.stream.Collectors;
|
|||
* Set of common functions for the framework
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
*/
|
||||
public abstract class AbstractPaceFunctions {
|
||||
|
||||
//city map to be used when translating the city names into codes
|
||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
//city map to be used when translating the city names into codes
|
||||
private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
//list of stopwords in different languages
|
||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
||||
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
||||
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
|
||||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||
//list of stopwords in different languages
|
||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
||||
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
||||
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
|
||||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||
|
||||
//blacklist of ngrams: to avoid generic keys
|
||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||
//blacklist of ngrams: to avoid generic keys
|
||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
//doi prefix for normalization
|
||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
//doi prefix for normalization
|
||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||
|
||||
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||
|
||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
||||
|
||||
protected String concat(final List<String> l) {
|
||||
return Joiner.on(" ").skipNulls().join(l);
|
||||
}
|
||||
protected String concat(final List<String> l) {
|
||||
return Joiner.on(" ").skipNulls().join(l);
|
||||
}
|
||||
|
||||
protected String cleanup(final String s) {
|
||||
final String s0 = unicodeNormalization(s.toLowerCase());
|
||||
final String s1 = fixAliases(s0);
|
||||
final String s2 = nfd(s1);
|
||||
final String s3 = s2.replaceAll("–", " ");
|
||||
final String s4 = s3.replaceAll("&", " ");
|
||||
final String s5 = s4.replaceAll(""", " ");
|
||||
final String s6 = s5.replaceAll("−", " ");
|
||||
final String s7 = s6.replaceAll("([0-9]+)", " $1 ");
|
||||
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
|
||||
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
|
||||
final String s10 = s9.replaceAll("\\n", " ");
|
||||
final String s11 = s10.replaceAll("(?m)\\s+", " ");
|
||||
final String s12 = s11.trim();
|
||||
return s12;
|
||||
}
|
||||
protected String cleanup(final String s) {
|
||||
final String s0 = unicodeNormalization(s.toLowerCase());
|
||||
final String s1 = fixAliases(s0);
|
||||
final String s2 = nfd(s1);
|
||||
final String s3 = s2.replaceAll("–", " ");
|
||||
final String s4 = s3.replaceAll("&", " ");
|
||||
final String s5 = s4.replaceAll(""", " ");
|
||||
final String s6 = s5.replaceAll("−", " ");
|
||||
final String s7 = s6.replaceAll("([0-9]+)", " $1 ");
|
||||
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
|
||||
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
|
||||
final String s10 = s9.replaceAll("\\n", " ");
|
||||
final String s11 = s10.replaceAll("(?m)\\s+", " ");
|
||||
final String s12 = s11.trim();
|
||||
return s12;
|
||||
}
|
||||
|
||||
protected boolean checkNumbers(final String a, final String b) {
|
||||
final String numbersA = getNumbers(a);
|
||||
final String numbersB = getNumbers(b);
|
||||
final String romansA = getRomans(a);
|
||||
final String romansB = getRomans(b);
|
||||
return !numbersA.equals(numbersB) || !romansA.equals(romansB);
|
||||
}
|
||||
protected boolean checkNumbers(final String a, final String b) {
|
||||
final String numbersA = getNumbers(a);
|
||||
final String numbersB = getNumbers(b);
|
||||
final String romansA = getRomans(a);
|
||||
final String romansB = getRomans(b);
|
||||
return !numbersA.equals(numbersB) || !romansA.equals(romansB);
|
||||
}
|
||||
|
||||
protected String getRomans(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isRoman(t) ? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
protected String getRomans(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isRoman(t) ? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected boolean isRoman(final String s) {
|
||||
return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
|
||||
}
|
||||
protected boolean isRoman(final String s) {
|
||||
return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
|
||||
}
|
||||
|
||||
protected String getNumbers(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isNumber(t)? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
protected String getNumbers(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isNumber(t) ? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public boolean isNumber(String strNum) {
|
||||
if (strNum == null) {
|
||||
return false;
|
||||
}
|
||||
return numberPattern.matcher(strNum).matches();
|
||||
}
|
||||
public boolean isNumber(String strNum) {
|
||||
if (strNum == null) {
|
||||
return false;
|
||||
}
|
||||
return numberPattern.matcher(strNum).matches();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected String removeSymbols(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
protected String removeSymbols(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
|
||||
}
|
||||
return sb.toString().replaceAll("\\s+", " ");
|
||||
}
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
|
||||
}
|
||||
return sb.toString().replaceAll("\\s+", " ");
|
||||
}
|
||||
|
||||
protected String getFirstValue(final Field values) {
|
||||
return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue() : "";
|
||||
}
|
||||
protected String getFirstValue(final Field values) {
|
||||
return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue() : "";
|
||||
}
|
||||
|
||||
protected boolean notNull(final String s) {
|
||||
return s != null;
|
||||
}
|
||||
protected boolean notNull(final String s) {
|
||||
return s != null;
|
||||
}
|
||||
|
||||
protected String normalize(final String s) {
|
||||
return nfd(unicodeNormalization(s))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
protected String normalize(final String s) {
|
||||
return nfd(unicodeNormalization(s))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
public String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
public String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public String unicodeNormalization(final String s) {
|
||||
public String unicodeNormalization(final String s) {
|
||||
|
||||
Matcher m = hexUnicodePattern.matcher(s);
|
||||
StringBuffer buf = new StringBuffer(s.length());
|
||||
while (m.find()) {
|
||||
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||
}
|
||||
m.appendTail(buf);
|
||||
return buf.toString();
|
||||
}
|
||||
Matcher m = hexUnicodePattern.matcher(s);
|
||||
StringBuffer buf = new StringBuffer(s.length());
|
||||
while (m.find()) {
|
||||
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||
}
|
||||
m.appendTail(buf);
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
||||
final StringTokenizer st = new StringTokenizer(s);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
while (st.hasMoreTokens()) {
|
||||
final String token = st.nextToken();
|
||||
if (!stopwords.contains(token)) {
|
||||
sb.append(token);
|
||||
sb.append(" ");
|
||||
}
|
||||
}
|
||||
return sb.toString().trim();
|
||||
}
|
||||
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
||||
final StringTokenizer st = new StringTokenizer(s);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
while (st.hasMoreTokens()) {
|
||||
final String token = st.nextToken();
|
||||
if (!stopwords.contains(token)) {
|
||||
sb.append(token);
|
||||
sb.append(" ");
|
||||
}
|
||||
}
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
public String filterAllStopWords(String s) {
|
||||
public String filterAllStopWords(String s) {
|
||||
|
||||
s = filterStopWords(s, stopwords_en);
|
||||
s = filterStopWords(s, stopwords_de);
|
||||
s = filterStopWords(s, stopwords_it);
|
||||
s = filterStopWords(s, stopwords_fr);
|
||||
s = filterStopWords(s, stopwords_pt);
|
||||
s = filterStopWords(s, stopwords_es);
|
||||
s = filterStopWords(s, stopwords_en);
|
||||
s = filterStopWords(s, stopwords_de);
|
||||
s = filterStopWords(s, stopwords_it);
|
||||
s = filterStopWords(s, stopwords_fr);
|
||||
s = filterStopWords(s, stopwords_pt);
|
||||
s = filterStopWords(s, stopwords_es);
|
||||
|
||||
return s;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
|
||||
final Set<String> newset = Sets.newLinkedHashSet();
|
||||
for (final String s : set) {
|
||||
if (!ngramBlacklist.contains(s)) {
|
||||
newset.add(s);
|
||||
}
|
||||
}
|
||||
return newset;
|
||||
}
|
||||
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
|
||||
final Set<String> newset = Sets.newLinkedHashSet();
|
||||
for (final String s : set) {
|
||||
if (!ngramBlacklist.contains(s)) {
|
||||
newset.add(s);
|
||||
}
|
||||
}
|
||||
return newset;
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
|
||||
h.add(s);
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
|
||||
h.add(s);
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
final Map<String, String> m = new HashMap<>();
|
||||
try {
|
||||
for (final String s: IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
|
||||
//string is like this: code;word1;word2;word3
|
||||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (int i=1; i<line.length;i++){
|
||||
m.put(line[i].toLowerCase(),value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e){
|
||||
return new HashMap<>();
|
||||
}
|
||||
return m;
|
||||
}
|
||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
final Map<String, String> m = new HashMap<>();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
|
||||
//string is like this: code;word1;word2;word3
|
||||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (int i = 1; i < line.length; i++) {
|
||||
m.put(line[i].toLowerCase(), value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
public String removeKeywords(String s, Set<String> keywords) {
|
||||
public String removeKeywords(String s, Set<String> keywords) {
|
||||
|
||||
s = " " + s + " ";
|
||||
for (String k: keywords ) {
|
||||
s = s.replaceAll(k.toLowerCase(), "");
|
||||
}
|
||||
s = " " + s + " ";
|
||||
for (String k : keywords) {
|
||||
s = s.replaceAll(k.toLowerCase(), "");
|
||||
}
|
||||
|
||||
return s.trim();
|
||||
}
|
||||
return s.trim();
|
||||
}
|
||||
|
||||
public double commonElementsPercentage(Set<String> s1, Set<String> s2){
|
||||
public double commonElementsPercentage(Set<String> s1, Set<String> s2) {
|
||||
|
||||
int longer = (s1.size()>s2.size())?s1.size():s2.size();
|
||||
double longer = Math.max(s1.size(), s2.size());
|
||||
return (double) s1.stream().filter(s2::contains).count() / longer;
|
||||
}
|
||||
|
||||
return (double)CollectionUtils.intersection(s1,s2).size()/(double)longer;
|
||||
}
|
||||
//convert the set of keywords to codes
|
||||
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
//convert the set of keywords to codes
|
||||
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||
}
|
||||
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return toCodes(keywords, translationMap);
|
||||
}
|
||||
|
||||
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return toCodes(keywords, translationMap);
|
||||
}
|
||||
public Set<String> citiesToCodes(Set<String> keywords) {
|
||||
return toCodes(keywords, cityMap);
|
||||
}
|
||||
|
||||
public Set<String> citiesToCodes(Set<String> keywords) {
|
||||
return toCodes(keywords, cityMap);
|
||||
}
|
||||
protected String firstLC(final String s) {
|
||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
}
|
||||
|
||||
protected String firstLC(final String s) {
|
||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
}
|
||||
protected Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
|
||||
protected Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
public String normalizePid(String pid) {
|
||||
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
|
||||
}
|
||||
|
||||
public String normalizePid(String pid) {
|
||||
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
|
||||
}
|
||||
//get the list of keywords into the input string
|
||||
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
|
||||
|
||||
//get the list of keywords into the input string
|
||||
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize){
|
||||
String s = s1;
|
||||
|
||||
String s = s1;
|
||||
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||
|
||||
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||
Set<String> codes = new HashSet<>();
|
||||
|
||||
Set<String> codes = new HashSet<>();
|
||||
if (tokens.size() < windowSize)
|
||||
windowSize = tokens.size();
|
||||
|
||||
if (tokens.size()<windowSize)
|
||||
windowSize = tokens.size();
|
||||
int length = windowSize;
|
||||
|
||||
int length = windowSize;
|
||||
while (length != 0) {
|
||||
|
||||
while (length != 0) {
|
||||
for (int i = 0; i <= tokens.size() - length; i++) {
|
||||
String candidate = concat(tokens.subList(i, i + length));
|
||||
if (translationMap.containsKey(candidate)) {
|
||||
codes.add(candidate);
|
||||
s = s.replace(candidate, "").trim();
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i<=tokens.size()-length; i++){
|
||||
String candidate = concat(tokens.subList(i, i + length));
|
||||
if (translationMap.containsKey(candidate)) {
|
||||
codes.add(candidate);
|
||||
s = s.replace(candidate, "").trim();
|
||||
}
|
||||
}
|
||||
tokens = Arrays.asList(s.split(" "));
|
||||
length -= 1;
|
||||
}
|
||||
|
||||
tokens = Arrays.asList(s.split(" "));
|
||||
length-=1;
|
||||
}
|
||||
return codes;
|
||||
}
|
||||
|
||||
return codes;
|
||||
}
|
||||
public Set<String> getCities(String s1, int windowSize) {
|
||||
return getKeywords(s1, cityMap, windowSize);
|
||||
}
|
||||
|
||||
public Set<String> getCities(String s1, int windowSize) {
|
||||
return getKeywords(s1, cityMap, windowSize);
|
||||
}
|
||||
|
||||
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -103,7 +104,11 @@ public class FieldDef implements Serializable {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (JsonProcessingException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Predicate;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
import java.util.Collection;
|
||||
|
@ -283,7 +283,12 @@ public class FieldListImpl extends AbstractField implements FieldList {
|
|||
case String:
|
||||
return Joiner.on(" ").join(stringList());
|
||||
case JSON:
|
||||
final String json = new Gson().toJson(stringList());
|
||||
String json;
|
||||
try {
|
||||
json = new ObjectMapper().writeValueAsString(this);
|
||||
} catch (JsonProcessingException e) {
|
||||
json = null;
|
||||
}
|
||||
return json;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown type: " + getType().toString());
|
||||
|
|
|
@ -2,12 +2,12 @@ package eu.dnetlib.pace.model;
|
|||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import org.apache.commons.collections.iterators.SingletonIterator;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
/**
|
||||
* The Class FieldValueImpl.
|
||||
|
@ -124,7 +124,7 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
|
|||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public Iterator<Field> iterator() {
|
||||
return new SingletonIterator(this);
|
||||
return Collections.singleton((Field) this).iterator();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,129 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Ordering;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.google.gson.Gson;
|
||||
|
||||
public class Author implements Comparable<Author> {
|
||||
|
||||
private String id;
|
||||
private String fullname;
|
||||
private String firstname;
|
||||
private String secondnames;
|
||||
|
||||
private List<Match> matches = Lists.newArrayList();
|
||||
private Set<Author> coauthors = Sets.newHashSet();
|
||||
private SubjectsMap subjectsMap = new SubjectsMap();
|
||||
|
||||
public Author() {
|
||||
super();
|
||||
}
|
||||
|
||||
public Author(final Author a) {
|
||||
this.id = a.getId();
|
||||
this.fullname = a.getFullname();
|
||||
this.firstname = a.getFirstname();
|
||||
this.secondnames = a.getSecondnames();
|
||||
|
||||
this.matches = a.getMatches();
|
||||
this.coauthors = a.getCoauthors();
|
||||
this.subjectsMap = a.getSubjectsMap();
|
||||
}
|
||||
|
||||
public boolean hasMatches() {
|
||||
return (getMatches() != null) && !getMatches().isEmpty();
|
||||
}
|
||||
|
||||
public boolean hasCoauthors() {
|
||||
return (getCoauthors() != null) && !getCoauthors().isEmpty();
|
||||
}
|
||||
|
||||
public boolean isWellFormed() {
|
||||
return StringUtils.isNotBlank(getSecondnames()) && StringUtils.isNotBlank(getFirstname());
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getFullname() {
|
||||
return fullname;
|
||||
}
|
||||
|
||||
public void setFullname(final String fullname) {
|
||||
this.fullname = fullname;
|
||||
}
|
||||
|
||||
public String getFirstname() {
|
||||
return firstname;
|
||||
}
|
||||
|
||||
public void setFirstname(final String firstname) {
|
||||
this.firstname = firstname;
|
||||
}
|
||||
|
||||
public String getSecondnames() {
|
||||
return secondnames;
|
||||
}
|
||||
|
||||
public void setSecondnames(final String secondnames) {
|
||||
this.secondnames = secondnames;
|
||||
}
|
||||
|
||||
public List<Match> getMatches() {
|
||||
return matches;
|
||||
}
|
||||
|
||||
public void setMatches(final List<Match> matches) {
|
||||
this.matches = matches;
|
||||
}
|
||||
|
||||
public Set<Author> getCoauthors() {
|
||||
return coauthors;
|
||||
}
|
||||
|
||||
public void setCoauthors(final Set<Author> coauthors) {
|
||||
this.coauthors = coauthors;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return getId().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final Author o) {
|
||||
return ComparisonChain.start()
|
||||
.compare(this.getId(), o.getId(), Ordering.natural().nullsLast())
|
||||
.result();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object o) {
|
||||
return (o instanceof Author) && getId().equals(((Author) o).getId());
|
||||
}
|
||||
|
||||
public SubjectsMap getSubjectsMap() {
|
||||
return subjectsMap;
|
||||
}
|
||||
|
||||
public void setSubjectsMap(final SubjectsMap subjectsMap) {
|
||||
this.subjectsMap = subjectsMap;
|
||||
}
|
||||
}
|
|
@ -1,37 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
public class AuthorSet {
|
||||
|
||||
private String id;
|
||||
private Authors authors;
|
||||
|
||||
public AuthorSet(final String id, final Authors authors) {
|
||||
super();
|
||||
this.id = id;
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public Authors getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
public void setAuthors(final Authors authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,54 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Ordering;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.google.gson.Gson;
|
||||
|
||||
public class Authors extends HashSet<Author> implements Comparable<Authors> {
|
||||
|
||||
private static final long serialVersionUID = -6878376220805286142L;
|
||||
|
||||
public Authors() {
|
||||
super();
|
||||
}
|
||||
|
||||
public Authors(final Collection<Author> authors) {
|
||||
super(authors);
|
||||
}
|
||||
|
||||
public Authors(final Author author) {
|
||||
super(Sets.newHashSet(author));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final Authors a) {
|
||||
return ComparisonChain.start()
|
||||
.compare(this.size(), a.size(), Ordering.natural().nullsLast())
|
||||
.result();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object o) {
|
||||
final boolean res = o instanceof Authors;
|
||||
return res && (Sets.intersection(this, (Authors) o).size() == this.size());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int res = 0;
|
||||
for (final Author a : this) {
|
||||
res += a.hashCode();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
public class CoAuthor extends Author {
|
||||
|
||||
private static final Log log = LogFactory.getLog(CoAuthor.class);
|
||||
private String anchorId = null;
|
||||
|
||||
public CoAuthor() {
|
||||
super();
|
||||
}
|
||||
|
||||
public CoAuthor(final Author author) {
|
||||
super(author);
|
||||
}
|
||||
|
||||
public boolean hasAnchorId() {
|
||||
return StringUtils.isNotBlank(getAnchorId());
|
||||
}
|
||||
|
||||
public String getAnchorId() {
|
||||
return anchorId;
|
||||
}
|
||||
|
||||
public void setAnchorId(final String anchorId) {
|
||||
this.anchorId = anchorId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return getId() != null ? getId().hashCode() : getFullname().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object o) {
|
||||
return (o instanceof CoAuthor) && StringUtils.isNotBlank(getId()) ?
|
||||
getId().equals(((CoAuthor) o).getId()) :
|
||||
getFullname().equals(((CoAuthor) o).getFullname());
|
||||
}
|
||||
|
||||
}
|
|
@ -1,36 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
public class CoAuthorSet {
|
||||
|
||||
private Author author;
|
||||
private Authors coAuthors;
|
||||
|
||||
public CoAuthorSet(final Author author, final Authors coAuthors) {
|
||||
super();
|
||||
this.author = author;
|
||||
this.coAuthors = coAuthors;
|
||||
}
|
||||
|
||||
public Author getAuthor() {
|
||||
return author;
|
||||
}
|
||||
|
||||
public void setAuthor(final Author author) {
|
||||
this.author = author;
|
||||
}
|
||||
|
||||
public Authors getCoAuthors() {
|
||||
return coAuthors;
|
||||
}
|
||||
|
||||
public void setCoAuthors(final Authors coAuthors) {
|
||||
this.coAuthors = coAuthors;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
}
|
|
@ -1,40 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
public class CoAuthorSetLite {
|
||||
|
||||
private String id;
|
||||
|
||||
private Set<String> coAuthors;
|
||||
|
||||
public CoAuthorSetLite(final String id, final Set<String> coAuthors) {
|
||||
super();
|
||||
this.id = id;
|
||||
this.coAuthors = coAuthors;
|
||||
}
|
||||
|
||||
public Set<String> getCoAuthors() {
|
||||
return coAuthors;
|
||||
}
|
||||
|
||||
public void setCoAuthors(final Set<String> coAuthors) {
|
||||
this.coAuthors = coAuthors;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,78 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Ordering;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.google.gson.Gson;
|
||||
|
||||
public class CoAuthors extends HashSet<CoAuthor> implements Comparable<CoAuthors> {
|
||||
|
||||
private static final long serialVersionUID = 2525591524516562892L;
|
||||
|
||||
private Function<CoAuthors, Integer> hashFunction;
|
||||
|
||||
private static Function<CoAuthors, Integer> defaultHashFunction = new Function<CoAuthors, Integer>() {
|
||||
|
||||
@Override
|
||||
public Integer apply(final CoAuthors input) {
|
||||
int res = 0;
|
||||
for (final CoAuthor a : input) {
|
||||
res += a.hashCode();
|
||||
}
|
||||
return res;
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
public CoAuthors() {
|
||||
super();
|
||||
}
|
||||
|
||||
public CoAuthors(final Collection<CoAuthor> coauthors) {
|
||||
super(coauthors);
|
||||
}
|
||||
|
||||
public CoAuthors(final CoAuthor coauthor) {
|
||||
super(Sets.newHashSet(coauthor));
|
||||
}
|
||||
|
||||
public Function<CoAuthors, Integer> getHashFunction() {
|
||||
return hashFunction;
|
||||
}
|
||||
|
||||
public void setHashFunction(final Function<CoAuthors, Integer> hashFunction) {
|
||||
this.hashFunction = hashFunction;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final CoAuthors a) {
|
||||
return ComparisonChain.start()
|
||||
.compare(this.size(), a.size(), Ordering.natural().nullsLast())
|
||||
.result();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object o) {
|
||||
final boolean res = o instanceof CoAuthors;
|
||||
return res && (Sets.intersection(this, (CoAuthors) o).size() == this.size());
|
||||
}
|
||||
|
||||
public String hashCodeString() {
|
||||
return String.valueOf(hashCode());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return (getHashFunction() != null) ? getHashFunction().apply(this) : defaultHashFunction.apply(this);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,196 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Ordering;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
|
||||
public class GTAuthor implements Comparable<GTAuthor> {
|
||||
|
||||
private String id;
|
||||
private Author author;
|
||||
private Authors merged;
|
||||
private CoAuthors coAuthors;
|
||||
private boolean anchor;
|
||||
|
||||
public GTAuthor() {}
|
||||
|
||||
public GTAuthor(final String id, final Authors merged, final CoAuthors coAuthors, final boolean anchor) {
|
||||
super();
|
||||
|
||||
if ((merged == null) || merged.isEmpty())
|
||||
throw new IllegalArgumentException("empty merged author set, id: " + id);
|
||||
|
||||
this.author = pickAuthor(merged);
|
||||
this.id = id;
|
||||
this.merged = merged;
|
||||
this.coAuthors = coAuthors;
|
||||
this.anchor = anchor;
|
||||
}
|
||||
|
||||
class AuthorFrequency extends Author {
|
||||
|
||||
private Integer frequency = new Integer(1);
|
||||
|
||||
public AuthorFrequency(final Author a) {
|
||||
super(a);
|
||||
}
|
||||
|
||||
public void increment() {
|
||||
setFrequency(getFrequency() + 1);
|
||||
}
|
||||
|
||||
public Integer getFrequency() {
|
||||
return frequency;
|
||||
}
|
||||
|
||||
public void setFrequency(final Integer frequency) {
|
||||
this.frequency = frequency;
|
||||
}
|
||||
}
|
||||
|
||||
private Author pickAuthor(final Authors merged) {
|
||||
final List<AuthorFrequency> freq = getFrequencies(merged);
|
||||
Collections.sort(freq, Collections.reverseOrder(new Comparator<AuthorFrequency>() {
|
||||
|
||||
@Override
|
||||
public int compare(final AuthorFrequency o1, final AuthorFrequency o2) {
|
||||
return ComparisonChain.start().compare(o1.getFullname().length(), o2.getFullname().length()).compare(o1.getFrequency(), o2.getFrequency())
|
||||
.result();
|
||||
}
|
||||
}));
|
||||
|
||||
return Iterables.getFirst(freq, null);
|
||||
}
|
||||
|
||||
private List<AuthorFrequency> getFrequencies(final Authors merged) {
|
||||
final Map<String, Integer> countMap = Maps.newHashMap();
|
||||
for (final Author a : merged) {
|
||||
final Integer count = countMap.get(a.getFullname());
|
||||
if (count == null) {
|
||||
countMap.put(a.getFullname(), new Integer(1));
|
||||
} else {
|
||||
countMap.put(a.getFullname(), count + 1);
|
||||
}
|
||||
}
|
||||
|
||||
return Lists.newArrayList(Iterables.transform(merged, new Function<Author, AuthorFrequency>() {
|
||||
|
||||
@Override
|
||||
public AuthorFrequency apply(final Author a) {
|
||||
final AuthorFrequency af = new AuthorFrequency(a);
|
||||
final Integer freq = countMap.get(af.getFullname());
|
||||
af.setFrequency(freq);
|
||||
return af;
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public Author getAuthor() {
|
||||
return author;
|
||||
}
|
||||
|
||||
public void setAuthor(final Author author) {
|
||||
this.author = author;
|
||||
}
|
||||
|
||||
public boolean hasMerged() {
|
||||
return (getMerged() != null) && !getMerged().isEmpty();
|
||||
}
|
||||
|
||||
public Authors getMerged() {
|
||||
return merged;
|
||||
}
|
||||
|
||||
public void setMerged(final Authors merged) {
|
||||
this.merged = merged;
|
||||
}
|
||||
|
||||
public boolean hasCoAuthors() {
|
||||
return (getCoAuthors() != null) && !getCoAuthors().isEmpty();
|
||||
}
|
||||
|
||||
public CoAuthors getCoAuthors() {
|
||||
return coAuthors;
|
||||
}
|
||||
|
||||
public void setCoAuthors(final CoAuthors coAuthors) {
|
||||
this.coAuthors = coAuthors;
|
||||
}
|
||||
|
||||
public boolean isAnchor() {
|
||||
return anchor;
|
||||
}
|
||||
|
||||
public void setAnchor(final boolean anchor) {
|
||||
this.anchor = anchor;
|
||||
}
|
||||
|
||||
public static GTAuthor fromJson(final String json) {
|
||||
final Gson gson = new Gson();
|
||||
return gson.fromJson(json, GTAuthor.class);
|
||||
}
|
||||
|
||||
public static List<GTAuthor> fromOafJson(final List<String> json) {
|
||||
|
||||
final GsonBuilder gb = new GsonBuilder();
|
||||
gb.registerTypeAdapter(GTAuthor.class, new GTAuthorOafSerialiser());
|
||||
final Gson gson = gb.create();
|
||||
|
||||
return Lists.newArrayList(Iterables.transform(json, new Function<String, GTAuthor>() {
|
||||
@Override
|
||||
public GTAuthor apply(final String s) {
|
||||
return gson.fromJson(s, GTAuthor.class);
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
public static GTAuthor fromOafJson(final String json) {
|
||||
|
||||
final GsonBuilder gb = new GsonBuilder();
|
||||
gb.registerTypeAdapter(GTAuthor.class, new GTAuthorOafSerialiser());
|
||||
final Gson gson = gb.create();
|
||||
|
||||
return gson.fromJson(json, GTAuthor.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return getId().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final GTAuthor o) {
|
||||
return ComparisonChain.start()
|
||||
.compare(this.getId(), o.getId(), Ordering.natural().nullsLast())
|
||||
.result();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object o) {
|
||||
return (o instanceof GTAuthor) && getId().equals(((GTAuthor) o).getId());
|
||||
}
|
||||
|
||||
}
|
|
@ -1,104 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.lang.reflect.Type;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gson.JsonDeserializationContext;
|
||||
import com.google.gson.JsonDeserializer;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonObject;
|
||||
import com.google.gson.JsonParseException;
|
||||
|
||||
public class GTAuthorOafSerialiser implements JsonDeserializer<GTAuthor> {
|
||||
|
||||
private static final String VALUE = "value";
|
||||
private static final String SECONDNAMES = "secondnames";
|
||||
private static final String FIRSTNAME = "firstname";
|
||||
private static final String FULLNAME = "fullname";
|
||||
private static final String ID = "id";
|
||||
private static final String MERGEDPERSON = "mergedperson";
|
||||
private static final String METADATA = "metadata";
|
||||
private static final String ANCHOR_ID = "anchorId";
|
||||
private static final String COAUTHOR = "coauthor";
|
||||
|
||||
@Override
|
||||
public GTAuthor deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
|
||||
final GTAuthor gta = new GTAuthor();
|
||||
|
||||
gta.setAuthor(getAuthor(json));
|
||||
gta.setMerged(getMerged(json));
|
||||
|
||||
gta.setCoAuthors(getCoAuthors(json));
|
||||
|
||||
return gta;
|
||||
}
|
||||
|
||||
private CoAuthors getCoAuthors(final JsonElement json) {
|
||||
final JsonObject obj = json.getAsJsonObject();
|
||||
if (!obj.has(COAUTHOR)) return null;
|
||||
return new CoAuthors(Lists.newArrayList(Iterables.transform(obj.get(COAUTHOR).getAsJsonArray(),
|
||||
new Function<JsonElement, CoAuthor>() {
|
||||
|
||||
@Override
|
||||
public CoAuthor apply(final JsonElement in) {
|
||||
final CoAuthor a = new CoAuthor(getAuthor(in));
|
||||
final JsonObject jsonObject = in.getAsJsonObject();
|
||||
if (jsonObject.has(ANCHOR_ID)) {
|
||||
a.setAnchorId(jsonObject.get(ANCHOR_ID).getAsString());
|
||||
}
|
||||
return a;
|
||||
}
|
||||
})));
|
||||
}
|
||||
|
||||
private Author getAuthor(final JsonElement json) {
|
||||
|
||||
final Author a = new Author();
|
||||
a.setCoauthors(null);
|
||||
a.setMatches(null);
|
||||
|
||||
final JsonObject jso = json.getAsJsonObject();
|
||||
|
||||
a.setId(jso.has(ID) ? jso.get(ID).getAsString() : null);
|
||||
|
||||
final JsonObject jsonObject = json.getAsJsonObject();
|
||||
if (jsonObject.has(METADATA)) {
|
||||
final JsonObject m = jsonObject.get(METADATA).getAsJsonObject();
|
||||
a.setFullname(getValue(m, FULLNAME));
|
||||
a.setFirstname(getValue(m, FIRSTNAME));
|
||||
a.setSecondnames(getValues(m, SECONDNAMES));
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
private Authors getMerged(final JsonElement json) {
|
||||
final JsonObject obj = json.getAsJsonObject();
|
||||
if (!obj.has(MERGEDPERSON)) return null;
|
||||
return new Authors(Lists.newArrayList(Iterables.transform(obj.get(MERGEDPERSON).getAsJsonArray(),
|
||||
new Function<JsonElement, Author>() {
|
||||
|
||||
@Override
|
||||
public Author apply(final JsonElement in) {
|
||||
return getAuthor(in);
|
||||
}
|
||||
})));
|
||||
}
|
||||
|
||||
private String getValues(final JsonObject m, final String fieldName) {
|
||||
return m.has(fieldName) ? Joiner.on(" ").join(Iterables.transform(m.get(fieldName).getAsJsonArray(), new Function<JsonElement, String>() {
|
||||
|
||||
@Override
|
||||
public String apply(final JsonElement in) {
|
||||
return in.getAsJsonObject().get(VALUE).getAsString();
|
||||
}
|
||||
})) : null;
|
||||
}
|
||||
|
||||
private String getValue(final JsonObject m, final String fieldName) {
|
||||
return m.has(fieldName) ? m.get(fieldName).getAsJsonObject().get(VALUE).getAsString() : null;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,44 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
public class Group {
|
||||
|
||||
private String id;
|
||||
private int size;
|
||||
private List<Result> results;
|
||||
|
||||
public Group() {}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public int getSize() {
|
||||
return size;
|
||||
}
|
||||
|
||||
public void setSize(final int size) {
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
public List<Result> getResults() {
|
||||
return results;
|
||||
}
|
||||
|
||||
public void setResults(final List<Result> results) {
|
||||
this.results = results;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,41 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
public class InvertedAuthor {
|
||||
|
||||
private Author author;
|
||||
private Collection<String> ids;
|
||||
|
||||
public InvertedAuthor() {}
|
||||
|
||||
public InvertedAuthor(final Author author, final Collection<String> ids) {
|
||||
super();
|
||||
this.author = author;
|
||||
this.ids = ids;
|
||||
}
|
||||
|
||||
public Author getAuthor() {
|
||||
return author;
|
||||
}
|
||||
|
||||
public void setAuthor(final Author author) {
|
||||
this.author = author;
|
||||
}
|
||||
|
||||
public Collection<String> getIds() {
|
||||
return ids;
|
||||
}
|
||||
|
||||
public void setIds(final Collection<String> ids) {
|
||||
this.ids = ids;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,31 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
public class Match extends Author {
|
||||
|
||||
private double score;
|
||||
|
||||
public Match() {
|
||||
super();
|
||||
}
|
||||
|
||||
public static Match from(final Author a) {
|
||||
final Match m = new Match();
|
||||
if (a.isWellFormed()) {
|
||||
m.setFirstname(a.getFirstname());
|
||||
m.setSecondnames(a.getSecondnames());
|
||||
}
|
||||
m.setFullname(a.getFullname());
|
||||
m.setId(a.getId());
|
||||
|
||||
return m;
|
||||
}
|
||||
|
||||
public double getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
public void setScore(final double score) {
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,72 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Ordering;
|
||||
import com.google.gson.Gson;
|
||||
|
||||
public class Result implements Comparable<Result> {
|
||||
|
||||
private String id;
|
||||
private String originalId;
|
||||
private String title;
|
||||
private List<Author> authors;
|
||||
|
||||
private double meanDistance;
|
||||
|
||||
public Result() {}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getOriginalId() {
|
||||
return originalId;
|
||||
}
|
||||
|
||||
public void setOriginalId(final String originalId) {
|
||||
this.originalId = originalId;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(final String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public List<Author> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
public void setAuthors(final List<Author> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final Result o) {
|
||||
return ComparisonChain.start()
|
||||
.compare(this.getAuthors().size(), o.getAuthors().size(), Ordering.natural().nullsLast())
|
||||
.result();
|
||||
}
|
||||
|
||||
public double getMeanDistance() {
|
||||
return meanDistance;
|
||||
}
|
||||
|
||||
public void setMeanDistance(final double meanDistance) {
|
||||
this.meanDistance = meanDistance;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,10 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * Map from a string key to an integer value, used as the value type of {@code SubjectsMap}.
 *
 * Created by claudio on 07/03/16.
 */
public class Subjects extends HashMap<String, Integer> {
}
|
|
@ -1,35 +0,0 @@
|
|||
package eu.dnetlib.pace.model.gt;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
/**
|
||||
* Created by claudio on 07/03/16.
|
||||
*/
|
||||
public class SubjectsMap extends HashMap<String, Subjects> {
|
||||
|
||||
public SubjectsMap mergeFrom(SubjectsMap sm) {
|
||||
|
||||
for(Entry<String, Subjects> e : sm.entrySet()) {
|
||||
if (!this.containsKey(e.getKey())) {
|
||||
Subjects sub = new Subjects();
|
||||
|
||||
sub.putAll(e.getValue());
|
||||
|
||||
this.put(e.getKey(), sub);
|
||||
} else {
|
||||
for (Entry<String, Integer> es : e.getValue().entrySet()) {
|
||||
final Subjects subjects = this.get(e.getKey());
|
||||
if (subjects.containsKey(es.getKey())) {
|
||||
subjects.put(es.getKey(), es.getValue() + subjects.get(es.getKey()));
|
||||
} else {
|
||||
subjects.put(es.getKey(), new Integer(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
|
@ -5,7 +5,7 @@ import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.pace.tree;
|
|||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
|
|
|
@ -4,7 +4,7 @@ import eu.dnetlib.pace.config.Config;
|
|||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
|
|
@ -1,14 +1,11 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import eu.dnetlib.pace.model.gt.Match;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The compare between two documents is given by the weighted mean of the field distances
|
||||
|
|
|
@ -8,7 +8,7 @@ import eu.dnetlib.pace.tree.support.TreeProcessor;
|
|||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.MapDocumentComparator;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
|
|
|
@ -1,15 +1,15 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
import org.apache.commons.lang.WordUtils;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
public class Capitalise implements Function<String, String> {
|
||||
|
||||
private final char[] DELIM = { ' ', '-' };
|
||||
private final char[] DELIM = {' ', '-'};
|
||||
|
||||
@Override
|
||||
public String apply(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||
}
|
||||
@Override
|
||||
public String apply(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||
}
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue