Optimizations for the Openorgs Dedup: normalization and inference of strings and implementation of new general-purpose comparators #455
|
@ -2,31 +2,41 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
@ClusteringClass("keywordsclustering")
|
@ClusteringClass("legalnameclustering")
|
||||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
public class LegalnameClustering extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public KeywordsClustering(Map<String, Object> params) {
|
private static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+");
|
||||||
|
private static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");
|
||||||
|
|
||||||
|
public LegalnameClustering(Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Set<String> getRegexList(String input, Pattern codeRegex) {
|
||||||
|
Matcher matcher = codeRegex.matcher(input);
|
||||||
|
Set<String> cities = new HashSet<>();
|
||||||
|
while (matcher.find()) {
|
||||||
|
cities.add(matcher.group());
|
||||||
|
}
|
||||||
|
return cities;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final Config conf, String s) {
|
protected Collection<String> doApply(final Config conf, String s) {
|
||||||
|
|
||||||
// takes city codes and keywords codes without duplicates
|
|
||||||
Set<String> keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4));
|
|
||||||
Set<String> cities = getCities(s, paramOrDefault("windowSize", 4));
|
|
||||||
|
|
||||||
// list of combination to return as result
|
// list of combination to return as result
|
||||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||||
|
|
||||||
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
|
for (String keyword : getRegexList(s, KEYWORD_CODE_PATTERN)) {
|
||||||
for (String city : citiesToCodes(cities)) {
|
for (String city : getRegexList(s, CITY_CODE_PATTERN)) {
|
||||||
combinations.add(keyword + "-" + city);
|
combinations.add(keyword + "-" + city);
|
||||||
if (combinations.size() >= paramOrDefault("max", 2)) {
|
if (combinations.size() >= paramOrDefault("max", 2)) {
|
||||||
return combinations;
|
return combinations;
|
||||||
|
@ -42,9 +52,6 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
return fields
|
return fields
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
.map(KeywordsClustering::cleanup)
|
|
||||||
.map(KeywordsClustering::normalize)
|
|
||||||
.map(s -> filterAllStopWords(s))
|
|
||||||
.map(s -> doApply(conf, s))
|
.map(s -> doApply(conf, s))
|
||||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
.flatMap(c -> c.stream())
|
.flatMap(c -> c.stream())
|
|
@ -27,6 +27,14 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||||
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||||
|
|
||||||
|
// keywords map to be used when translating the keyword names into codes
|
||||||
|
private static Map<String, String> keywordMap = AbstractPaceFunctions
|
||||||
|
.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||||
|
|
||||||
|
// country map to be used when inferring the country from the city name
|
||||||
|
private static Map<String, String> countryMap = AbstractPaceFunctions
|
||||||
|
.loadCountryMapFromClasspath("/eu/dnetlib/pace/config/country_map.csv");
|
||||||
|
|
||||||
// list of stopwords in different languages
|
// list of stopwords in different languages
|
||||||
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
|
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
|
||||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||||
|
@ -74,6 +82,64 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
return s12;
|
return s12;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String countryInference(final String original, String inferFrom) {
|
||||||
|
if (!original.equalsIgnoreCase("unknown"))
|
||||||
|
return original;
|
||||||
|
|
||||||
|
inferFrom = cleanup(inferFrom);
|
||||||
|
inferFrom = normalize(inferFrom);
|
||||||
|
inferFrom = filterAllStopWords(inferFrom);
|
||||||
|
Set<String> cities = getCities(inferFrom, 4);
|
||||||
|
return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String cityInference(String original) {
|
||||||
|
original = cleanup(original);
|
||||||
|
original = normalize(original);
|
||||||
|
original = filterAllStopWords(original);
|
||||||
|
|
||||||
|
Set<String> cities = getCities(original, 4);
|
||||||
|
|
||||||
|
for (String city : cities) {
|
||||||
|
original = original.replaceAll(city, cityMap.get(city));
|
||||||
|
}
|
||||||
|
|
||||||
|
return original;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String keywordInference(String original) {
|
||||||
|
original = cleanup(original);
|
||||||
|
original = normalize(original);
|
||||||
|
original = filterAllStopWords(original);
|
||||||
|
|
||||||
|
Set<String> keywords = getKeywords(original, keywordMap, 4);
|
||||||
|
|
||||||
|
for (String keyword : keywords) {
|
||||||
|
original = original.replaceAll(keyword, keywordMap.get(keyword));
|
||||||
|
}
|
||||||
|
|
||||||
|
return original;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String cityKeywordInference(String original) {
|
||||||
|
original = cleanup(original);
|
||||||
|
original = normalize(original);
|
||||||
|
original = filterAllStopWords(original);
|
||||||
|
|
||||||
|
Set<String> keywords = getKeywords(original, keywordMap, 4);
|
||||||
|
Set<String> cities = getCities(original, 4);
|
||||||
|
|
||||||
|
for (String keyword : keywords) {
|
||||||
|
original = original.replaceAll(keyword, keywordMap.get(keyword));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String city : cities) {
|
||||||
|
original = original.replaceAll(city, cityMap.get(city));
|
||||||
|
}
|
||||||
|
|
||||||
|
return original;
|
||||||
|
}
|
||||||
|
|
||||||
protected static String fixXML(final String a) {
|
protected static String fixXML(final String a) {
|
||||||
|
|
||||||
return a
|
return a
|
||||||
|
@ -208,6 +274,30 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
return m;
|
return m;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Map<String, String> loadCountryMapFromClasspath(final String classpath) {
|
||||||
|
|
||||||
|
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
|
||||||
|
final Map<String, String> m = new HashMap<>();
|
||||||
|
try {
|
||||||
|
for (final String s : IOUtils
|
||||||
|
.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||||
|
// string is like this: country_code;city1;city2;city3
|
||||||
|
String[] line = s.split(";");
|
||||||
|
String value = line[0];
|
||||||
|
for (int i = 1; i < line.length; i++) {
|
||||||
|
String city = fixAliases(transliterator.transliterate(line[i].toLowerCase()));
|
||||||
|
String code = cityMap.get(city);
|
||||||
|
m.put(code, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
return new HashMap<>();
|
||||||
|
}
|
||||||
|
return m;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public static String removeKeywords(String s, Set<String> keywords) {
|
public static String removeKeywords(String s, Set<String> keywords) {
|
||||||
|
|
||||||
s = " " + s + " ";
|
s = " " + s + " ";
|
||||||
|
@ -237,6 +327,10 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
return toCodes(keywords, cityMap);
|
return toCodes(keywords, cityMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Set<String> citiesToCountry(Set<String> cities) {
|
||||||
|
return toCodes(toCodes(cities, cityMap), countryMap);
|
||||||
|
}
|
||||||
|
|
||||||
protected static String firstLC(final String s) {
|
protected static String firstLC(final String s) {
|
||||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,9 +47,21 @@ public class FieldDef implements Serializable {
|
||||||
|
|
||||||
private String clean;
|
private String clean;
|
||||||
|
|
||||||
|
private String infer;
|
||||||
|
|
||||||
|
private String inferenceFrom;
|
||||||
|
|
||||||
public FieldDef() {
|
public FieldDef() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getInferenceFrom() {
|
||||||
|
return inferenceFrom;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setInferenceFrom(final String inferenceFrom) {
|
||||||
|
this.inferenceFrom = inferenceFrom;
|
||||||
|
}
|
||||||
|
|
||||||
public String getName() {
|
public String getName() {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
@ -126,6 +138,14 @@ public class FieldDef implements Serializable {
|
||||||
this.clean = clean;
|
this.clean = clean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getInfer() {
|
||||||
|
return infer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setInfer(String infer) {
|
||||||
|
this.infer = infer;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -123,9 +123,19 @@ case class SparkModel(conf: DedupConfig) {
|
||||||
case _ => res(index)
|
case _ => res(index)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (StringUtils.isNotBlank(fdef.getInfer)) {
|
||||||
|
val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
|
||||||
|
res(index) = res(index) match {
|
||||||
|
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
|
||||||
|
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
res
|
res
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
new GenericRowWithSchema(values, schema)
|
new GenericRowWithSchema(values, schema)
|
||||||
|
@ -146,5 +156,17 @@ case class SparkModel(conf: DedupConfig) {
|
||||||
res
|
res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Applies the inference selected by `infertype` to `value`.
  *
  * "country" delegates to `countryInference` (which also receives `inferfrom`), "city",
  * "keyword" and "city_keyword" delegate to the matching `AbstractPaceFunctions` function;
  * any other type leaves the value untouched.
  */
def inference(value: String, inferfrom: String, infertype: String) : String =
  infertype match {
    case "country" => AbstractPaceFunctions.countryInference(value, inferfrom)
    case "city" => AbstractPaceFunctions.cityInference(value)
    case "keyword" => AbstractPaceFunctions.keywordInference(value)
    case "city_keyword" => AbstractPaceFunctions.cityKeywordInference(value)
    case _ => value
  }
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,48 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
|
||||||
|
|
||||||
@ComparatorClass("cityMatch")
|
|
||||||
public class CityMatch extends AbstractStringComparator {
|
|
||||||
|
|
||||||
private Map<String, String> params;
|
|
||||||
|
|
||||||
public CityMatch(Map<String, String> params) {
|
|
||||||
super(params);
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
|
||||||
|
|
||||||
String ca = cleanup(a);
|
|
||||||
String cb = cleanup(b);
|
|
||||||
|
|
||||||
ca = normalize(ca);
|
|
||||||
cb = normalize(cb);
|
|
||||||
|
|
||||||
ca = filterAllStopWords(ca);
|
|
||||||
cb = filterAllStopWords(cb);
|
|
||||||
|
|
||||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
|
|
||||||
Set<String> codes1 = citiesToCodes(cities1);
|
|
||||||
Set<String> codes2 = citiesToCodes(cities2);
|
|
||||||
|
|
||||||
// if no cities are detected, the comparator gives 1.0
|
|
||||||
if (codes1.isEmpty() && codes2.isEmpty())
|
|
||||||
return 1.0;
|
|
||||||
else {
|
|
||||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
|
||||||
return -1; // undefined if one of the two has no cities
|
|
||||||
return commonElementsPercentage(codes1, codes2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
@ComparatorClass("codeMatch")
|
||||||
|
public class CodeMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
private Pattern CODE_REGEX;
|
||||||
|
|
||||||
|
public CodeMatch(Map<String, String> params) {
|
||||||
|
super(params);
|
||||||
|
this.params = params;
|
||||||
|
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<String> getRegexList(String input) {
|
||||||
|
Matcher matcher = this.CODE_REGEX.matcher(input);
|
||||||
|
Set<String> cities = new HashSet<>();
|
||||||
|
while (matcher.find()) {
|
||||||
|
cities.add(matcher.group());
|
||||||
|
}
|
||||||
|
return cities;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
|
|
||||||
|
Set<String> codes1 = getRegexList(a);
|
||||||
|
Set<String> codes2 = getRegexList(b);
|
||||||
|
|
||||||
|
// if no codes are detected, the comparator gives 1.0
|
||||||
|
if (codes1.isEmpty() && codes2.isEmpty())
|
||||||
|
return 1.0;
|
||||||
|
else {
|
||||||
|
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||||
|
return -1; // undefined if one of the two has no codes
|
||||||
|
return commonElementsPercentage(codes1, codes2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,6 +1,8 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
@ -11,37 +13,42 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
@ComparatorClass("countryMatch")
|
@ComparatorClass("countryMatch")
|
||||||
public class CountryMatch extends AbstractStringComparator {
|
public class CountryMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
public CountryMatch(Map<String, String> params) {
|
private Map<String, String> params;
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
|
||||||
}
|
|
||||||
|
|
||||||
public CountryMatch(final double weight) {
|
public CountryMatch(Map<String, String> params) {
|
||||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
|
public CountryMatch(final double weight) {
|
||||||
super(weight, ssalgo);
|
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
super(weight, ssalgo);
|
||||||
if (a.isEmpty() || b.isEmpty()) {
|
}
|
||||||
return -1.0; // return -1 if a field is missing
|
|
||||||
}
|
|
||||||
if (a.equalsIgnoreCase("unknown") || b.equalsIgnoreCase("unknown")) {
|
|
||||||
return -1.0; // return -1 if a country is UNKNOWN
|
|
||||||
}
|
|
||||||
|
|
||||||
return a.equals(b) ? 1.0 : 0;
|
@Override
|
||||||
}
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
|
|
||||||
@Override
|
if (a.isEmpty() || b.isEmpty()) {
|
||||||
public double getWeight() {
|
return -1.0; // return -1 if a field is missing
|
||||||
return super.weight;
|
}
|
||||||
}
|
if (a.equalsIgnoreCase("unknown") || b.equalsIgnoreCase("unknown")) {
|
||||||
|
return -1.0; // return -1 if a country is UNKNOWN
|
||||||
|
}
|
||||||
|
|
||||||
|
return a.equals(b) ? 1.0 : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double getWeight() {
|
||||||
|
return super.weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected double normalize(final double d) {
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
protected double normalize(final double d) {
|
|
||||||
return d;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
@ComparatorClass("jaroWinklerLegalname")
|
||||||
|
public class JaroWinklerLegalname extends AbstractStringComparator {
|
||||||
|
|
||||||
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
private final String CITY_CODE_REGEX = "city::\\d+";
|
||||||
|
private final String KEYWORD_CODE_REGEX = "key::\\d+";
|
||||||
|
|
||||||
|
public JaroWinklerLegalname(Map<String, String> params) {
|
||||||
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
|
public JaroWinklerLegalname(double weight) {
|
||||||
|
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected JaroWinklerLegalname(double weight, AbstractStringDistance ssalgo) {
|
||||||
|
super(weight, ssalgo);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double distance(String a, String b, final Config conf) {
|
||||||
|
|
||||||
|
String ca = a.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
|
||||||
|
String cb = b.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
|
||||||
|
|
||||||
|
ca = ca.replaceAll("[ ]{2,}", " ");
|
||||||
|
cb = cb.replaceAll("[ ]{2,}", " ");
|
||||||
|
|
||||||
|
if (ca.isEmpty() && cb.isEmpty())
|
||||||
|
return 1.0;
|
||||||
|
else
|
||||||
|
return normalize(ssalgo.score(ca, cb));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double getWeight() {
|
||||||
|
return super.weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected double normalize(double d) {
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,74 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
|
||||||
|
|
||||||
@ComparatorClass("jaroWinklerNormalizedName")
|
|
||||||
public class JaroWinklerNormalizedName extends AbstractStringComparator {
|
|
||||||
|
|
||||||
private Map<String, String> params;
|
|
||||||
|
|
||||||
public JaroWinklerNormalizedName(Map<String, String> params) {
|
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
public JaroWinklerNormalizedName(double weight) {
|
|
||||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
|
||||||
}
|
|
||||||
|
|
||||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
|
||||||
super(weight, ssalgo);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double distance(String a, String b, final Config conf) {
|
|
||||||
String ca = cleanup(a);
|
|
||||||
String cb = cleanup(b);
|
|
||||||
|
|
||||||
ca = normalize(ca);
|
|
||||||
cb = normalize(cb);
|
|
||||||
|
|
||||||
ca = filterAllStopWords(ca);
|
|
||||||
cb = filterAllStopWords(cb);
|
|
||||||
|
|
||||||
Set<String> keywords1 = getKeywords(
|
|
||||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
Set<String> keywords2 = getKeywords(
|
|
||||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
|
|
||||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
|
|
||||||
ca = removeKeywords(ca, keywords1);
|
|
||||||
ca = removeKeywords(ca, cities1);
|
|
||||||
cb = removeKeywords(cb, keywords2);
|
|
||||||
cb = removeKeywords(cb, cities2);
|
|
||||||
|
|
||||||
ca = ca.replaceAll("[ ]{2,}", " ");
|
|
||||||
cb = cb.replaceAll("[ ]{2,}", " ");
|
|
||||||
|
|
||||||
if (ca.isEmpty() && cb.isEmpty())
|
|
||||||
return 1.0;
|
|
||||||
else
|
|
||||||
return normalize(ssalgo.score(ca, cb));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double getWeight() {
|
|
||||||
return super.weight;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected double normalize(double d) {
|
|
||||||
return d;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,50 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
|
||||||
|
|
||||||
@ComparatorClass("keywordMatch")
|
|
||||||
public class KeywordMatch extends AbstractStringComparator {
|
|
||||||
|
|
||||||
Map<String, String> params;
|
|
||||||
|
|
||||||
public KeywordMatch(Map<String, String> params) {
|
|
||||||
super(params);
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
|
||||||
|
|
||||||
String ca = cleanup(a);
|
|
||||||
String cb = cleanup(b);
|
|
||||||
|
|
||||||
ca = normalize(ca);
|
|
||||||
cb = normalize(cb);
|
|
||||||
|
|
||||||
ca = filterAllStopWords(ca);
|
|
||||||
cb = filterAllStopWords(cb);
|
|
||||||
|
|
||||||
Set<String> keywords1 = getKeywords(
|
|
||||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
Set<String> keywords2 = getKeywords(
|
|
||||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
|
|
||||||
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
|
|
||||||
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
|
|
||||||
|
|
||||||
// if no cities are detected, the comparator gives 1.0
|
|
||||||
if (codes1.isEmpty() && codes2.isEmpty())
|
|
||||||
return 1.0;
|
|
||||||
else {
|
|
||||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
|
||||||
return -1.0; // undefined if one of the two has no keywords
|
|
||||||
return commonElementsPercentage(codes1, codes2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
|
||||||
// function for the evaluation of the node
|
// function for the evaluation of the node
|
||||||
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
||||||
|
|
||||||
TreeNodeStats stats = new TreeNodeStats();
|
TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
|
||||||
|
|
||||||
// for each field in the node, it computes the
|
// for each field in the node, it computes the
|
||||||
for (FieldConf fieldConf : fields) {
|
for (FieldConf fieldConf : fields) {
|
||||||
|
|
|
@ -9,8 +9,11 @@ public class TreeNodeStats implements Serializable {
|
||||||
|
|
||||||
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
||||||
|
|
||||||
public TreeNodeStats() {
|
private final boolean ignoreUndefined;
|
||||||
|
|
||||||
|
public TreeNodeStats(boolean ignoreUndefined) {
|
||||||
this.results = new HashMap<>();
|
this.results = new HashMap<>();
|
||||||
|
this.ignoreUndefined = ignoreUndefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, FieldStats> getResults() {
|
public Map<String, FieldStats> getResults() {
|
||||||
|
@ -22,7 +25,10 @@ public class TreeNodeStats implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public int fieldsCount() {
|
public int fieldsCount() {
|
||||||
return this.results.size();
|
if(ignoreUndefined)
|
||||||
|
return this.results.size();
|
||||||
|
else
|
||||||
|
return this.results.size() - undefinedCount(); //do not count undefined
|
||||||
}
|
}
|
||||||
|
|
||||||
public int undefinedCount() {
|
public int undefinedCount() {
|
||||||
|
@ -78,11 +84,25 @@ public class TreeNodeStats implements Serializable {
|
||||||
double min = 100.0; // random high value
|
double min = 100.0; // random high value
|
||||||
for (FieldStats fs : this.results.values()) {
|
for (FieldStats fs : this.results.values()) {
|
||||||
if (fs.getResult() < min) {
|
if (fs.getResult() < min) {
|
||||||
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
|
if (fs.getResult() == -1) {
|
||||||
|
if (fs.isCountIfUndefined()) {
|
||||||
|
min = 0.0;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
min = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
min = fs.getResult();
|
min = fs.getResult();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return min;
|
if (ignoreUndefined) {
|
||||||
|
return min==-1.0? 0.0 : min;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return min;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if at least one is true, return 1.0
|
// if at least one is true, return 1.0
|
||||||
|
@ -91,7 +111,11 @@ public class TreeNodeStats implements Serializable {
|
||||||
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||||
return 1.0;
|
return 1.0;
|
||||||
}
|
}
|
||||||
return 0.0;
|
if (!ignoreUndefined && undefinedCount()>0){
|
||||||
|
return -1.0;
|
||||||
|
} else {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if at least one is false, return 0.0
|
// if at least one is false, return 0.0
|
||||||
|
@ -100,7 +124,7 @@ public class TreeNodeStats implements Serializable {
|
||||||
|
|
||||||
if (fieldStats.getResult() == -1) {
|
if (fieldStats.getResult() == -1) {
|
||||||
if (fieldStats.isCountIfUndefined())
|
if (fieldStats.isCountIfUndefined())
|
||||||
return 0.0;
|
return ignoreUndefined? 0.0 : -1.0;
|
||||||
} else {
|
} else {
|
||||||
if (fieldStats.getResult() < fieldStats.getThreshold())
|
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
|
|
@ -43,18 +43,17 @@ public class TreeProcessor {
|
||||||
|
|
||||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||||
treeStats.addNodeStats(nextNodeName, stats);
|
treeStats.addNodeStats(nextNodeName, stats);
|
||||||
|
|
||||||
// if ignoreUndefined=false the miss is considered as undefined
|
double finalScore = stats.getFinalScore(currentNode.getAggregation());
|
||||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
if(finalScore == -1.0)
|
||||||
nextNodeName = currentNode.getUndefined();
|
nextNodeName = currentNode.getUndefined();
|
||||||
}
|
else if (finalScore >= currentNode.getThreshold()) {
|
||||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
|
||||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
|
||||||
nextNodeName = currentNode.getPositive();
|
nextNodeName = currentNode.getPositive();
|
||||||
} else {
|
} else {
|
||||||
nextNodeName = currentNode.getNegative();
|
nextNodeName = currentNode.getNegative();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
|
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
|
||||||
|
|
||||||
treeStats.setResult(MatchType.parse(nextNodeName));
|
treeStats.setResult(MatchType.parse(nextNodeName));
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.mongodb.connection.Cluster;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
@ -177,41 +178,16 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testKeywordsClustering() {
|
public void legalnameClustering() {
|
||||||
|
|
||||||
final ClusteringFunction cf = new KeywordsClustering(params);
|
final ClusteringFunction cf = new LegalnameClustering(params);
|
||||||
final String s = "Polytechnic University of Turin";
|
String s = "key::1 key::2 city::1";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
final String s1 = "POLITECNICO DI TORINO";
|
s = "key::1 key::2 city::1 city::2";
|
||||||
System.out.println(s1);
|
System.out.println(s);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
|
||||||
System.out.println("s2 = " + s2);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
|
|
||||||
|
|
||||||
final String s3 = "universita universita milano milano";
|
|
||||||
System.out.println("s3 = " + s3);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
|
|
||||||
|
|
||||||
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
|
||||||
System.out.println("s4 = " + s4);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
|
|
||||||
|
|
||||||
final String s5 = "İstanbul Ticarət Universiteti";
|
|
||||||
System.out.println("s5 = " + s5);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
|
|
||||||
|
|
||||||
final String s6 = "National and Kapodistrian University of Athens";
|
|
||||||
System.out.println("s6 = " + s6);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
|
|
||||||
|
|
||||||
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
|
|
||||||
System.out.println("s7 = " + s7);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -54,4 +54,47 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
|
||||||
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void countryInferenceTest() {
|
||||||
|
assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
|
||||||
|
assertEquals("UK", countryInference("UK", "Università di Bologna"));
|
||||||
|
assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
|
||||||
|
assertEquals("UNKNOWN", countryInference("UNKNOWN", "Università del Lavoro"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void cityInferenceTest() {
|
||||||
|
assertEquals("universita city::3181928", cityInference("Università di Bologna"));
|
||||||
|
assertEquals("university city::3170647", cityInference("University of Pisa"));
|
||||||
|
assertEquals("universita", cityInference("Università del lavoro"));
|
||||||
|
assertEquals("universita city::3173331 city::3169522", cityInference("Università di Modena e Reggio Emilia"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void keywordInferenceTest() {
|
||||||
|
assertEquals("key::41 turin", keywordInference("Polytechnic University of Turin"));
|
||||||
|
assertEquals("key::41 torino", keywordInference("POLITECNICO DI TORINO"));
|
||||||
|
assertEquals(
|
||||||
|
"key::1 key::60 key::81 milano bergamo",
|
||||||
|
keywordInference("Universita farmaceutica culturale di milano bergamo"));
|
||||||
|
assertEquals("key::1 key::1 milano milano", keywordInference("universita universita milano milano"));
|
||||||
|
assertEquals(
|
||||||
|
"key::10 kapodistriako panepistemio athenon",
|
||||||
|
keywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void cityKeywordInferenceTest() {
|
||||||
|
assertEquals("key::41 city::3165524", cityKeywordInference("Polytechnic University of Turin"));
|
||||||
|
assertEquals("key::41 city::3165524", cityKeywordInference("POLITECNICO DI TORINO"));
|
||||||
|
assertEquals(
|
||||||
|
"key::1 key::60 key::81 city::3173435 city::3182164",
|
||||||
|
cityKeywordInference("Universita farmaceutica culturale di milano bergamo"));
|
||||||
|
assertEquals(
|
||||||
|
"key::1 key::1 city::3173435 city::3173435", cityKeywordInference("universita universita milano milano"));
|
||||||
|
assertEquals(
|
||||||
|
"key::10 kapodistriako panepistemio city::264371",
|
||||||
|
cityKeywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,6 +35,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
params.put("name_th", "0.95");
|
params.put("name_th", "0.95");
|
||||||
params.put("jpath_value", "$.value");
|
params.put("jpath_value", "$.value");
|
||||||
params.put("jpath_classid", "$.qualifier.classid");
|
params.put("jpath_classid", "$.qualifier.classid");
|
||||||
|
params.put("codeRegex", "key::\\d+");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -44,52 +45,23 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void cityMatchTest() {
|
public void codeMatchTest() {
|
||||||
final CityMatch cityMatch = new CityMatch(params);
|
CodeMatch codeMatch = new CodeMatch(params);
|
||||||
|
|
||||||
// both names with no cities
|
// both names with no codes
|
||||||
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
|
assertEquals(1.0, codeMatch.distance("testing1", "testing2", conf));
|
||||||
|
|
||||||
// one of the two names with no cities
|
// one of the two names with no codes
|
||||||
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
|
assertEquals(-1.0, codeMatch.distance("testing1 key::1", "testing", conf));
|
||||||
|
|
||||||
// both names with cities (same)
|
// both names with codes (same)
|
||||||
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
|
assertEquals(1.0, codeMatch.distance("testing1 key::1", "testing2 key::1", conf));
|
||||||
|
|
||||||
// both names with cities (different)
|
// both names with codes (different)
|
||||||
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
|
assertEquals(0.0, codeMatch.distance("testing1 key::1", "testing2 key::2", conf));
|
||||||
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
|
|
||||||
|
|
||||||
// particular cases
|
// both names with codes (1 same, 1 different)
|
||||||
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
|
assertEquals(0.5,codeMatch.distance("key::1 key::2 testing1", "key::1 testing", conf));
|
||||||
assertEquals(
|
|
||||||
1.0,
|
|
||||||
cityMatch
|
|
||||||
.distance(
|
|
||||||
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
|
|
||||||
conf));
|
|
||||||
|
|
||||||
// failing because 'Allen' is a transliterated Greek stopword
|
|
||||||
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
|
||||||
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void keywordMatchTest() {
|
|
||||||
params.put("threshold", "0.5");
|
|
||||||
|
|
||||||
final KeywordMatch keywordMatch = new KeywordMatch(params);
|
|
||||||
|
|
||||||
assertEquals(
|
|
||||||
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
|
|
||||||
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
|
|
||||||
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
|
|
||||||
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
|
||||||
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
|
|
||||||
assertEquals(2.0 / 3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
|
|
||||||
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
|
|
||||||
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
|
|
||||||
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -155,15 +127,15 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void jaroWinklerNormalizedNameTest() {
|
public void jaroWinklerLegalnameTest() {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerLegalname jaroWinklerLegalname = new JaroWinklerLegalname(params);
|
||||||
|
|
||||||
double result = jaroWinklerNormalizedName
|
double result = jaroWinklerLegalname
|
||||||
.distance("AT&T (United States)", "United States Military Academy", conf);
|
.distance("AT&T (United States)", "United States key::2 key::1", conf);
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
|
|
||||||
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
result = jaroWinklerLegalname.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -344,13 +316,13 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
|
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
|
||||||
assertEquals(-1.0, result);
|
assertEquals(-1.0, result);
|
||||||
|
|
||||||
result = countryMatch.distance("CHILE", "UNKNOWN", conf);
|
result = countryMatch.distance("CL", "UNKNOWN", conf);
|
||||||
assertEquals(-1.0, result);
|
assertEquals(-1.0, result);
|
||||||
|
|
||||||
result = countryMatch.distance("CHILE", "ITALY", conf);
|
result = countryMatch.distance("CL", "IT", conf);
|
||||||
assertEquals(0.0, result);
|
assertEquals(0.0, result);
|
||||||
|
|
||||||
result = countryMatch.distance("CHILE", "CHILE", conf);
|
result = countryMatch.distance("CL", "CL", conf);
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -190,7 +190,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("orp_simrel = " + orp_simrel);
|
System.out.println("orp_simrel = " + orp_simrel);
|
||||||
|
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(751, orgs_simrel);
|
assertEquals(742, orgs_simrel);
|
||||||
assertEquals(566, pubs_simrel);
|
assertEquals(566, pubs_simrel);
|
||||||
assertEquals(113, sw_simrel);
|
assertEquals(113, sw_simrel);
|
||||||
assertEquals(148, ds_simrel);
|
assertEquals(148, ds_simrel);
|
||||||
|
@ -251,7 +251,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
// entity simrels are supposed to be equal in number to those of the previous step (no rels in whitelist)
|
// entity simrels are supposed to be equal in number to those of the previous step (no rels in whitelist)
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(751, orgs_simrel);
|
assertEquals(742, orgs_simrel);
|
||||||
assertEquals(566, pubs_simrel);
|
assertEquals(566, pubs_simrel);
|
||||||
assertEquals(148, ds_simrel);
|
assertEquals(148, ds_simrel);
|
||||||
assertEquals(280, orp_simrel);
|
assertEquals(280, orp_simrel);
|
||||||
|
@ -442,7 +442,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
final List<Relation> merges = pubs
|
final List<Relation> merges = pubs
|
||||||
.filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
.filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
||||||
.collectAsList();
|
.collectAsList();
|
||||||
assertEquals(3, merges.size());
|
assertEquals(1, merges.size());
|
||||||
Set<String> dups = Sets
|
Set<String> dups = Sets
|
||||||
.newHashSet(
|
.newHashSet(
|
||||||
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
|
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
|
||||||
|
@ -451,7 +451,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
merges.forEach(r -> {
|
merges.forEach(r -> {
|
||||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||||
assertEquals(ModelConstants.MERGES, r.getRelClass());
|
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||||
assertTrue(dups.contains(r.getTarget()));
|
assertTrue(dups.contains(r.getTarget()));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -561,7 +561,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("orp_mergerel = " + orp_mergerel);
|
System.out.println("orp_mergerel = " + orp_mergerel);
|
||||||
|
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(1268, orgs_mergerel);
|
assertEquals(1278, orgs_mergerel);
|
||||||
assertEquals(1156, pubs.count());
|
assertEquals(1156, pubs.count());
|
||||||
assertEquals(292, sw_mergerel);
|
assertEquals(292, sw_mergerel);
|
||||||
assertEquals(476, ds_mergerel);
|
assertEquals(476, ds_mergerel);
|
||||||
|
@ -618,7 +618,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("orp_deduprecord = " + orp_deduprecord);
|
System.out.println("orp_deduprecord = " + orp_deduprecord);
|
||||||
|
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(86, orgs_deduprecord);
|
assertEquals(78, orgs_deduprecord);
|
||||||
assertEquals(96, pubs.count());
|
assertEquals(96, pubs.count());
|
||||||
assertEquals(47, sw_deduprecord);
|
assertEquals(47, sw_deduprecord);
|
||||||
assertEquals(97, ds_deduprecord);
|
assertEquals(97, ds_deduprecord);
|
||||||
|
@ -761,7 +761,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(930, publications);
|
assertEquals(930, publications);
|
||||||
assertEquals(839, organizations);
|
assertEquals(831, organizations);
|
||||||
assertEquals(100, projects);
|
assertEquals(100, projects);
|
||||||
assertEquals(100, datasource);
|
assertEquals(100, datasource);
|
||||||
assertEquals(196, softwares);
|
assertEquals(196, softwares);
|
||||||
|
|
|
@ -22,8 +22,11 @@ import java.util.Properties;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.Row;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.*;
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
@ -143,7 +146,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(86, orgs_simrel);
|
assertEquals(92, orgs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -172,7 +175,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(122, orgs_simrel);
|
assertEquals(128, orgs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -207,7 +210,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
||||||
.count();
|
.count();
|
||||||
assertEquals(132, orgs_mergerel);
|
assertEquals(128, orgs_mergerel);
|
||||||
|
|
||||||
// verify that a DiffRel is in the mergerels (to be sure that the job supposed to remove them has something to
|
// verify that a DiffRel is in the mergerels (to be sure that the job supposed to remove them has something to
|
||||||
// do)
|
// do)
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.dedup.jpath;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.oa.dedup.SparkOpenorgsDedupTest;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.sql.Row;
|
import org.apache.spark.sql.Row;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
@ -24,6 +25,30 @@ class JsonPathTest {
|
||||||
|
|
||||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
|
||||||
|
System.out.println("row = " + row);
|
||||||
|
Assertions.assertNotNull(row);
|
||||||
|
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||||
|
|
||||||
|
System.out.println("row = " + row.getAs("countrytitle"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void jsonToModelTest() throws IOException{
|
||||||
|
DedupConfig conf = DedupConfig
|
||||||
|
.load(IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkOpenorgsDedupTest.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||||
|
|
||||||
|
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||||
|
|
||||||
|
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
// to check that the same parsing returns the same row
|
||||||
|
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
|
||||||
|
Assertions.assertEquals(row, row1);
|
||||||
|
System.out.println("row = " + row);
|
||||||
Assertions.assertNotNull(row);
|
Assertions.assertNotNull(row);
|
||||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,8 +4,8 @@
|
||||||
"dedupRun" : "001",
|
"dedupRun" : "001",
|
||||||
"entityType" : "organization",
|
"entityType" : "organization",
|
||||||
"subEntityValue": "organization",
|
"subEntityValue": "organization",
|
||||||
"orderField" : "legalname",
|
"orderField" : "original_legalname",
|
||||||
"queueMaxSize" : "2000",
|
"queueMaxSize" : "100000",
|
||||||
"groupMaxSize" : "50",
|
"groupMaxSize" : "50",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
"idPath":"$.id",
|
"idPath":"$.id",
|
||||||
|
@ -15,10 +15,10 @@
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
{ "name" : "sortedngrampairs", "fields" : [ "original_legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
{ "name" : "suffixprefix", "fields" : [ "original_legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
{ "name" : "legalnameclustering", "fields" : [ "legalname" ], "params" : { "max": 2} }
|
||||||
],
|
],
|
||||||
"decisionTree" : {
|
"decisionTree" : {
|
||||||
"start": {
|
"start": {
|
||||||
|
@ -29,16 +29,23 @@
|
||||||
"weight": 1,
|
"weight": 1,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "false",
|
||||||
"params": {}
|
"params": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "rorid",
|
||||||
|
"comparator": "exactMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 1,
|
"threshold": 1,
|
||||||
"aggregation": "AVG",
|
"aggregation": "OR",
|
||||||
"positive": "MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "layer2",
|
"undefined": "necessaryConditions",
|
||||||
"ignoreUndefined": "false"
|
"ignoreUndefined": "false"
|
||||||
},
|
},
|
||||||
"layer2": {
|
"necessaryConditions": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "websiteurl",
|
"field": "websiteurl",
|
||||||
|
@ -55,14 +62,14 @@
|
||||||
"params": {}
|
"params": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "original_legalname",
|
||||||
"comparator": "numbersMatch",
|
"comparator": "numbersMatch",
|
||||||
"weight": 1,
|
"weight": 1,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {}
|
"params": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "original_legalname",
|
||||||
"comparator": "romansMatch",
|
"comparator": "romansMatch",
|
||||||
"weight": 1,
|
"weight": 1,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
|
@ -71,68 +78,64 @@
|
||||||
],
|
],
|
||||||
"threshold": 1,
|
"threshold": 1,
|
||||||
"aggregation": "AND",
|
"aggregation": "AND",
|
||||||
"positive": "layer3",
|
"positive": "cityCheck",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "layer3",
|
"undefined": "cityCheck",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"layer3": {
|
"cityCheck": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "legalname",
|
||||||
"comparator": "cityMatch",
|
"comparator": "codeMatch",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {
|
"params": {
|
||||||
"windowSize": "4"
|
"codeRegex": "city::\\d+"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.1,
|
"threshold": 0.1,
|
||||||
"aggregation": "AVG",
|
"aggregation": "AVG",
|
||||||
"positive": "layer4",
|
"positive": "keywordCheck",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "NO_MATCH",
|
"undefined": "NO_MATCH",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"layer4": {
|
"keywordCheck": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "legalname",
|
||||||
"comparator": "keywordMatch",
|
"comparator": "codeMatch",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {
|
"params": {
|
||||||
"windowSize": "4"
|
"codeRegex": "key::\\d+"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.7,
|
"threshold": 0.7,
|
||||||
"aggregation": "AVG",
|
"aggregation": "AVG",
|
||||||
"positive": "layer5",
|
"positive": "nameCheck",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "layer5",
|
"undefined": "nameCheck",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"layer5": {
|
"nameCheck": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "legalname",
|
||||||
"comparator": "jaroWinklerNormalizedName",
|
"comparator": "jaroWinklerLegalname",
|
||||||
"weight": 0.9,
|
"weight": 0.9,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {
|
"params": {}
|
||||||
"windowSize": "4"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "legalshortname",
|
"field": "legalshortname",
|
||||||
"comparator": "jaroWinklerNormalizedName",
|
"comparator": "jaroWinklerLegalname",
|
||||||
"weight": 0.1,
|
"weight": 0.1,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "false",
|
||||||
"params": {
|
"params": {}
|
||||||
"windowSize": 4
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.9,
|
"threshold": 0.9,
|
||||||
|
@ -144,126 +147,16 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"model" : [
|
"model" : [
|
||||||
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
|
{ "name" : "country", "type" : "String", "path" : "$.country.classid", "infer" : "country", "inferenceFrom" : "$.legalname.value"},
|
||||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
|
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value", "infer" : "city_keyword"},
|
||||||
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
|
{ "name" : "original_legalname", "type" : "String", "path" : "$.legalname.value" },
|
||||||
|
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value", "infer" : "city_keyword"},
|
||||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
||||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
|
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
|
||||||
|
{ "name" : "rorid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='ROR')].value"},
|
||||||
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
||||||
],
|
],
|
||||||
"blacklists" : {
|
"blacklists" : {},
|
||||||
"legalname" : []
|
"synonyms": {}
|
||||||
},
|
|
||||||
"synonyms": {
|
|
||||||
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
|
||||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
|
||||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
|
||||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
|
||||||
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
|
|
||||||
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
|
|
||||||
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
|
|
||||||
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
|
||||||
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
|
||||||
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
|
||||||
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
|
||||||
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
|
||||||
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
|
||||||
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
|
|
||||||
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
|
|
||||||
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
|
|
||||||
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
|
|
||||||
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
|
|
||||||
"key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],
|
|
||||||
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
|
|
||||||
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
|
|
||||||
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
|
|
||||||
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
|
|
||||||
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
|
|
||||||
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
|
|
||||||
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
|
|
||||||
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
|
|
||||||
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
|
|
||||||
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
|
|
||||||
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
|
|
||||||
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
|
|
||||||
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
|
|
||||||
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
|
|
||||||
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
|
|
||||||
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
|
|
||||||
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
|
|
||||||
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
|
|
||||||
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
|
|
||||||
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
|
|
||||||
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
|
|
||||||
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
|
|
||||||
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
|
|
||||||
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
|
|
||||||
"key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
|
|
||||||
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
|
|
||||||
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
|
|
||||||
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
|
|
||||||
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
|
|
||||||
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
|
|
||||||
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
|
|
||||||
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
|
|
||||||
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
|
|
||||||
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
|
|
||||||
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
|
|
||||||
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
|
|
||||||
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
|
|
||||||
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
|
|
||||||
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
|
|
||||||
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
|
|
||||||
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
|
|
||||||
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
|
|
||||||
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
|
|
||||||
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
|
|
||||||
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
|
|
||||||
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
|
|
||||||
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
|
|
||||||
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
|
|
||||||
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
|
|
||||||
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
|
|
||||||
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
|
|
||||||
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
|
|
||||||
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
|
|
||||||
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
|
|
||||||
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
|
|
||||||
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
|
|
||||||
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
|
|
||||||
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
|
|
||||||
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
|
|
||||||
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
|
|
||||||
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
|
|
||||||
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
|
|
||||||
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
|
|
||||||
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
|
|
||||||
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
|
|
||||||
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
|
|
||||||
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
|
|
||||||
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
|
|
||||||
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
|
|
||||||
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
|
|
||||||
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
|
|
||||||
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
|
|
||||||
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
|
|
||||||
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
|
|
||||||
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
|
|
||||||
"key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
|
|
||||||
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
|
|
||||||
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
|
|
||||||
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
|
|
||||||
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
|
|
||||||
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
|
|
||||||
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
|
|
||||||
"key::102": ["informatics","informatica","informática","informática","informatica",""],
|
|
||||||
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
|
||||||
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
|
|
||||||
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
|
|
||||||
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
|
|
||||||
"key::107" : ["agricultural forestry", "af", "a f"],
|
|
||||||
"key::108" : ["agricultural mechanical", "am", "a m"],
|
|
||||||
"key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue