forked from D-Net/dnet-hadoop
optimization of normalization stage in openorgs workflow, implementation of new comparators replacing older versions, openorgs configuration update, addition of inference flag in model definition, new test classes
This commit is contained in:
parent
c7634c55c7
commit
2a36ccb997
|
@ -2,31 +2,41 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
@ClusteringClass("keywordsclustering")
|
@ClusteringClass("legalnameclustering")
|
||||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
public class LegalnameClustering extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public KeywordsClustering(Map<String, Object> params) {
|
private static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+");
|
||||||
|
private static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");
|
||||||
|
|
||||||
|
public LegalnameClustering(Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Set<String> getRegexList(String input, Pattern codeRegex) {
|
||||||
|
Matcher matcher = codeRegex.matcher(input);
|
||||||
|
Set<String> cities = new HashSet<>();
|
||||||
|
while (matcher.find()) {
|
||||||
|
cities.add(matcher.group());
|
||||||
|
}
|
||||||
|
return cities;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final Config conf, String s) {
|
protected Collection<String> doApply(final Config conf, String s) {
|
||||||
|
|
||||||
// takes city codes and keywords codes without duplicates
|
|
||||||
Set<String> keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4));
|
|
||||||
Set<String> cities = getCities(s, paramOrDefault("windowSize", 4));
|
|
||||||
|
|
||||||
// list of combination to return as result
|
// list of combination to return as result
|
||||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||||
|
|
||||||
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
|
for (String keyword : getRegexList(s, KEYWORD_CODE_PATTERN)) {
|
||||||
for (String city : citiesToCodes(cities)) {
|
for (String city : getRegexList(s, CITY_CODE_PATTERN)) {
|
||||||
combinations.add(keyword + "-" + city);
|
combinations.add(keyword + "-" + city);
|
||||||
if (combinations.size() >= paramOrDefault("max", 2)) {
|
if (combinations.size() >= paramOrDefault("max", 2)) {
|
||||||
return combinations;
|
return combinations;
|
||||||
|
@ -42,9 +52,6 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
return fields
|
return fields
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
.map(KeywordsClustering::cleanup)
|
|
||||||
.map(KeywordsClustering::normalize)
|
|
||||||
.map(s -> filterAllStopWords(s))
|
|
||||||
.map(s -> doApply(conf, s))
|
.map(s -> doApply(conf, s))
|
||||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
.flatMap(c -> c.stream())
|
.flatMap(c -> c.stream())
|
|
@ -27,6 +27,14 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||||
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||||
|
|
||||||
|
// keywords map to be used when translating the keyword names into codes
|
||||||
|
private static Map<String, String> keywordMap = AbstractPaceFunctions
|
||||||
|
.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||||
|
|
||||||
|
// country map to be used when inferring the country from the city name
|
||||||
|
private static Map<String, String> countryMap = AbstractPaceFunctions
|
||||||
|
.loadCountryMapFromClasspath("/eu/dnetlib/pace/config/country_map.csv");
|
||||||
|
|
||||||
// list of stopwords in different languages
|
// list of stopwords in different languages
|
||||||
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
|
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
|
||||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||||
|
@ -74,6 +82,64 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
return s12;
|
return s12;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String countryInference(final String original, String inferFrom) {
|
||||||
|
if (!original.equalsIgnoreCase("unknown"))
|
||||||
|
return original;
|
||||||
|
|
||||||
|
inferFrom = cleanup(inferFrom);
|
||||||
|
inferFrom = normalize(inferFrom);
|
||||||
|
inferFrom = filterAllStopWords(inferFrom);
|
||||||
|
Set<String> cities = getCities(inferFrom, 4);
|
||||||
|
return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String cityInference(String original) {
|
||||||
|
original = cleanup(original);
|
||||||
|
original = normalize(original);
|
||||||
|
original = filterAllStopWords(original);
|
||||||
|
|
||||||
|
Set<String> cities = getCities(original, 4);
|
||||||
|
|
||||||
|
for (String city : cities) {
|
||||||
|
original = original.replaceAll(city, cityMap.get(city));
|
||||||
|
}
|
||||||
|
|
||||||
|
return original;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String keywordInference(String original) {
|
||||||
|
original = cleanup(original);
|
||||||
|
original = normalize(original);
|
||||||
|
original = filterAllStopWords(original);
|
||||||
|
|
||||||
|
Set<String> keywords = getKeywords(original, keywordMap, 4);
|
||||||
|
|
||||||
|
for (String keyword : keywords) {
|
||||||
|
original = original.replaceAll(keyword, keywordMap.get(keyword));
|
||||||
|
}
|
||||||
|
|
||||||
|
return original;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String cityKeywordInference(String original) {
|
||||||
|
original = cleanup(original);
|
||||||
|
original = normalize(original);
|
||||||
|
original = filterAllStopWords(original);
|
||||||
|
|
||||||
|
Set<String> keywords = getKeywords(original, keywordMap, 4);
|
||||||
|
Set<String> cities = getCities(original, 4);
|
||||||
|
|
||||||
|
for (String keyword : keywords) {
|
||||||
|
original = original.replaceAll(keyword, keywordMap.get(keyword));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String city : cities) {
|
||||||
|
original = original.replaceAll(city, cityMap.get(city));
|
||||||
|
}
|
||||||
|
|
||||||
|
return original;
|
||||||
|
}
|
||||||
|
|
||||||
protected static String fixXML(final String a) {
|
protected static String fixXML(final String a) {
|
||||||
|
|
||||||
return a
|
return a
|
||||||
|
@ -208,6 +274,30 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
return m;
|
return m;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Map<String, String> loadCountryMapFromClasspath(final String classpath) {
|
||||||
|
|
||||||
|
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
|
||||||
|
final Map<String, String> m = new HashMap<>();
|
||||||
|
try {
|
||||||
|
for (final String s : IOUtils
|
||||||
|
.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||||
|
// string is like this: country_code;city1;city2;city3
|
||||||
|
String[] line = s.split(";");
|
||||||
|
String value = line[0];
|
||||||
|
for (int i = 1; i < line.length; i++) {
|
||||||
|
String city = fixAliases(transliterator.transliterate(line[i].toLowerCase()));
|
||||||
|
String code = cityMap.get(city);
|
||||||
|
m.put(code, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
return new HashMap<>();
|
||||||
|
}
|
||||||
|
return m;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public static String removeKeywords(String s, Set<String> keywords) {
|
public static String removeKeywords(String s, Set<String> keywords) {
|
||||||
|
|
||||||
s = " " + s + " ";
|
s = " " + s + " ";
|
||||||
|
@ -237,6 +327,10 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
return toCodes(keywords, cityMap);
|
return toCodes(keywords, cityMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Set<String> citiesToCountry(Set<String> cities) {
|
||||||
|
return toCodes(toCodes(cities, cityMap), countryMap);
|
||||||
|
}
|
||||||
|
|
||||||
protected static String firstLC(final String s) {
|
protected static String firstLC(final String s) {
|
||||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,9 +47,21 @@ public class FieldDef implements Serializable {
|
||||||
|
|
||||||
private String clean;
|
private String clean;
|
||||||
|
|
||||||
|
private String infer;
|
||||||
|
|
||||||
|
private String inferenceFrom;
|
||||||
|
|
||||||
public FieldDef() {
|
public FieldDef() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getInferenceFrom() {
|
||||||
|
return inferenceFrom;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setInferenceFrom(final String inferenceFrom) {
|
||||||
|
this.inferenceFrom = inferenceFrom;
|
||||||
|
}
|
||||||
|
|
||||||
public String getName() {
|
public String getName() {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
@ -126,6 +138,14 @@ public class FieldDef implements Serializable {
|
||||||
this.clean = clean;
|
this.clean = clean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getInfer() {
|
||||||
|
return infer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setInfer(String infer) {
|
||||||
|
this.infer = infer;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -123,9 +123,19 @@ case class SparkModel(conf: DedupConfig) {
|
||||||
case _ => res(index)
|
case _ => res(index)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (StringUtils.isNotBlank(fdef.getInfer)) {
|
||||||
|
val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
|
||||||
|
res(index) = res(index) match {
|
||||||
|
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
|
||||||
|
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
res
|
res
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
new GenericRowWithSchema(values, schema)
|
new GenericRowWithSchema(values, schema)
|
||||||
|
@ -146,5 +156,17 @@ case class SparkModel(conf: DedupConfig) {
|
||||||
res
|
res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def inference(value: String, inferfrom: String, infertype: String) : String = {
|
||||||
|
val res = infertype match {
|
||||||
|
case "country" => AbstractPaceFunctions.countryInference(value, inferfrom)
|
||||||
|
case "city" => AbstractPaceFunctions.cityInference(value)
|
||||||
|
case "keyword" => AbstractPaceFunctions.keywordInference(value)
|
||||||
|
case "city_keyword" => AbstractPaceFunctions.cityKeywordInference(value)
|
||||||
|
case _ => value
|
||||||
|
}
|
||||||
|
|
||||||
|
res
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,48 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
|
||||||
|
|
||||||
@ComparatorClass("cityMatch")
|
|
||||||
public class CityMatch extends AbstractStringComparator {
|
|
||||||
|
|
||||||
private Map<String, String> params;
|
|
||||||
|
|
||||||
public CityMatch(Map<String, String> params) {
|
|
||||||
super(params);
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
|
||||||
|
|
||||||
String ca = cleanup(a);
|
|
||||||
String cb = cleanup(b);
|
|
||||||
|
|
||||||
ca = normalize(ca);
|
|
||||||
cb = normalize(cb);
|
|
||||||
|
|
||||||
ca = filterAllStopWords(ca);
|
|
||||||
cb = filterAllStopWords(cb);
|
|
||||||
|
|
||||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
|
|
||||||
Set<String> codes1 = citiesToCodes(cities1);
|
|
||||||
Set<String> codes2 = citiesToCodes(cities2);
|
|
||||||
|
|
||||||
// if no cities are detected, the comparator gives 1.0
|
|
||||||
if (codes1.isEmpty() && codes2.isEmpty())
|
|
||||||
return 1.0;
|
|
||||||
else {
|
|
||||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
|
||||||
return -1; // undefined if one of the two has no cities
|
|
||||||
return commonElementsPercentage(codes1, codes2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
@ComparatorClass("codeMatch")
|
||||||
|
public class CodeMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
private Pattern CODE_REGEX;
|
||||||
|
|
||||||
|
public CodeMatch(Map<String, String> params) {
|
||||||
|
super(params);
|
||||||
|
this.params = params;
|
||||||
|
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<String> getRegexList(String input) {
|
||||||
|
Matcher matcher = this.CODE_REGEX.matcher(input);
|
||||||
|
Set<String> cities = new HashSet<>();
|
||||||
|
while (matcher.find()) {
|
||||||
|
cities.add(matcher.group());
|
||||||
|
}
|
||||||
|
return cities;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
|
|
||||||
|
Set<String> codes1 = getRegexList(a);
|
||||||
|
Set<String> codes2 = getRegexList(b);
|
||||||
|
|
||||||
|
// if no codes are detected, the comparator gives 1.0
|
||||||
|
if (codes1.isEmpty() && codes2.isEmpty())
|
||||||
|
return 1.0;
|
||||||
|
else {
|
||||||
|
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||||
|
return -1; // undefined if one of the two has no codes
|
||||||
|
return commonElementsPercentage(codes1, codes2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,6 +1,8 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
@ -11,8 +13,11 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
@ComparatorClass("countryMatch")
|
@ComparatorClass("countryMatch")
|
||||||
public class CountryMatch extends AbstractStringComparator {
|
public class CountryMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
|
private Map<String, String> params;
|
||||||
|
|
||||||
public CountryMatch(Map<String, String> params) {
|
public CountryMatch(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
public CountryMatch(final double weight) {
|
public CountryMatch(final double weight) {
|
||||||
|
@ -25,6 +30,7 @@ public class CountryMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
|
|
||||||
if (a.isEmpty() || b.isEmpty()) {
|
if (a.isEmpty() || b.isEmpty()) {
|
||||||
return -1.0; // return -1 if a field is missing
|
return -1.0; // return -1 if a field is missing
|
||||||
}
|
}
|
||||||
|
@ -44,4 +50,5 @@ public class CountryMatch extends AbstractStringComparator {
|
||||||
protected double normalize(final double d) {
|
protected double normalize(final double d) {
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
@ComparatorClass("jaroWinklerLegalname")
|
||||||
|
public class JaroWinklerLegalname extends AbstractStringComparator {
|
||||||
|
|
||||||
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
private final String CITY_CODE_REGEX = "city::\\d+";
|
||||||
|
private final String KEYWORD_CODE_REGEX = "key::\\d+";
|
||||||
|
|
||||||
|
public JaroWinklerLegalname(Map<String, String> params) {
|
||||||
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
|
public JaroWinklerLegalname(double weight) {
|
||||||
|
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected JaroWinklerLegalname(double weight, AbstractStringDistance ssalgo) {
|
||||||
|
super(weight, ssalgo);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double distance(String a, String b, final Config conf) {
|
||||||
|
|
||||||
|
String ca = a.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
|
||||||
|
String cb = b.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
|
||||||
|
|
||||||
|
ca = ca.replaceAll("[ ]{2,}", " ");
|
||||||
|
cb = cb.replaceAll("[ ]{2,}", " ");
|
||||||
|
|
||||||
|
if (ca.isEmpty() && cb.isEmpty())
|
||||||
|
return 1.0;
|
||||||
|
else
|
||||||
|
return normalize(ssalgo.score(ca, cb));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double getWeight() {
|
||||||
|
return super.weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected double normalize(double d) {
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,74 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
|
||||||
|
|
||||||
@ComparatorClass("jaroWinklerNormalizedName")
|
|
||||||
public class JaroWinklerNormalizedName extends AbstractStringComparator {
|
|
||||||
|
|
||||||
private Map<String, String> params;
|
|
||||||
|
|
||||||
public JaroWinklerNormalizedName(Map<String, String> params) {
|
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
public JaroWinklerNormalizedName(double weight) {
|
|
||||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
|
||||||
}
|
|
||||||
|
|
||||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
|
||||||
super(weight, ssalgo);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double distance(String a, String b, final Config conf) {
|
|
||||||
String ca = cleanup(a);
|
|
||||||
String cb = cleanup(b);
|
|
||||||
|
|
||||||
ca = normalize(ca);
|
|
||||||
cb = normalize(cb);
|
|
||||||
|
|
||||||
ca = filterAllStopWords(ca);
|
|
||||||
cb = filterAllStopWords(cb);
|
|
||||||
|
|
||||||
Set<String> keywords1 = getKeywords(
|
|
||||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
Set<String> keywords2 = getKeywords(
|
|
||||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
|
|
||||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
|
|
||||||
ca = removeKeywords(ca, keywords1);
|
|
||||||
ca = removeKeywords(ca, cities1);
|
|
||||||
cb = removeKeywords(cb, keywords2);
|
|
||||||
cb = removeKeywords(cb, cities2);
|
|
||||||
|
|
||||||
ca = ca.replaceAll("[ ]{2,}", " ");
|
|
||||||
cb = cb.replaceAll("[ ]{2,}", " ");
|
|
||||||
|
|
||||||
if (ca.isEmpty() && cb.isEmpty())
|
|
||||||
return 1.0;
|
|
||||||
else
|
|
||||||
return normalize(ssalgo.score(ca, cb));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double getWeight() {
|
|
||||||
return super.weight;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected double normalize(double d) {
|
|
||||||
return d;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,50 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
|
||||||
|
|
||||||
@ComparatorClass("keywordMatch")
|
|
||||||
public class KeywordMatch extends AbstractStringComparator {
|
|
||||||
|
|
||||||
Map<String, String> params;
|
|
||||||
|
|
||||||
public KeywordMatch(Map<String, String> params) {
|
|
||||||
super(params);
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
|
||||||
|
|
||||||
String ca = cleanup(a);
|
|
||||||
String cb = cleanup(b);
|
|
||||||
|
|
||||||
ca = normalize(ca);
|
|
||||||
cb = normalize(cb);
|
|
||||||
|
|
||||||
ca = filterAllStopWords(ca);
|
|
||||||
cb = filterAllStopWords(cb);
|
|
||||||
|
|
||||||
Set<String> keywords1 = getKeywords(
|
|
||||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
Set<String> keywords2 = getKeywords(
|
|
||||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
|
||||||
|
|
||||||
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
|
|
||||||
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
|
|
||||||
|
|
||||||
// if no cities are detected, the comparator gives 1.0
|
|
||||||
if (codes1.isEmpty() && codes2.isEmpty())
|
|
||||||
return 1.0;
|
|
||||||
else {
|
|
||||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
|
||||||
return -1.0; // undefined if one of the two has no keywords
|
|
||||||
return commonElementsPercentage(codes1, codes2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
|
||||||
// function for the evaluation of the node
|
// function for the evaluation of the node
|
||||||
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
||||||
|
|
||||||
TreeNodeStats stats = new TreeNodeStats();
|
TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
|
||||||
|
|
||||||
// for each field in the node, it computes the
|
// for each field in the node, it computes the
|
||||||
for (FieldConf fieldConf : fields) {
|
for (FieldConf fieldConf : fields) {
|
||||||
|
|
|
@ -9,8 +9,11 @@ public class TreeNodeStats implements Serializable {
|
||||||
|
|
||||||
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
||||||
|
|
||||||
public TreeNodeStats() {
|
private final boolean ignoreUndefined;
|
||||||
|
|
||||||
|
public TreeNodeStats(boolean ignoreUndefined) {
|
||||||
this.results = new HashMap<>();
|
this.results = new HashMap<>();
|
||||||
|
this.ignoreUndefined = ignoreUndefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, FieldStats> getResults() {
|
public Map<String, FieldStats> getResults() {
|
||||||
|
@ -22,7 +25,10 @@ public class TreeNodeStats implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public int fieldsCount() {
|
public int fieldsCount() {
|
||||||
|
if(ignoreUndefined)
|
||||||
return this.results.size();
|
return this.results.size();
|
||||||
|
else
|
||||||
|
return this.results.size() - undefinedCount(); //do not count undefined
|
||||||
}
|
}
|
||||||
|
|
||||||
public int undefinedCount() {
|
public int undefinedCount() {
|
||||||
|
@ -78,12 +84,26 @@ public class TreeNodeStats implements Serializable {
|
||||||
double min = 100.0; // random high value
|
double min = 100.0; // random high value
|
||||||
for (FieldStats fs : this.results.values()) {
|
for (FieldStats fs : this.results.values()) {
|
||||||
if (fs.getResult() < min) {
|
if (fs.getResult() < min) {
|
||||||
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
|
if (fs.getResult() == -1) {
|
||||||
|
if (fs.isCountIfUndefined()) {
|
||||||
|
min = 0.0;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
min = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
min = fs.getResult();
|
min = fs.getResult();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (ignoreUndefined) {
|
||||||
|
return min==-1.0? 0.0 : min;
|
||||||
|
}
|
||||||
|
else {
|
||||||
return min;
|
return min;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// if at least one is true, return 1.0
|
// if at least one is true, return 1.0
|
||||||
public double or() {
|
public double or() {
|
||||||
|
@ -91,8 +111,12 @@ public class TreeNodeStats implements Serializable {
|
||||||
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||||
return 1.0;
|
return 1.0;
|
||||||
}
|
}
|
||||||
|
if (!ignoreUndefined && undefinedCount()>0){
|
||||||
|
return -1.0;
|
||||||
|
} else {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// if at least one is false, return 0.0
|
// if at least one is false, return 0.0
|
||||||
public double and() {
|
public double and() {
|
||||||
|
@ -100,7 +124,7 @@ public class TreeNodeStats implements Serializable {
|
||||||
|
|
||||||
if (fieldStats.getResult() == -1) {
|
if (fieldStats.getResult() == -1) {
|
||||||
if (fieldStats.isCountIfUndefined())
|
if (fieldStats.isCountIfUndefined())
|
||||||
return 0.0;
|
return ignoreUndefined? 0.0 : -1.0;
|
||||||
} else {
|
} else {
|
||||||
if (fieldStats.getResult() < fieldStats.getThreshold())
|
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
|
|
@ -44,17 +44,16 @@ public class TreeProcessor {
|
||||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||||
treeStats.addNodeStats(nextNodeName, stats);
|
treeStats.addNodeStats(nextNodeName, stats);
|
||||||
|
|
||||||
// if ignoreUndefined=false the miss is considered as undefined
|
double finalScore = stats.getFinalScore(currentNode.getAggregation());
|
||||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
if(finalScore == -1.0)
|
||||||
nextNodeName = currentNode.getUndefined();
|
nextNodeName = currentNode.getUndefined();
|
||||||
}
|
else if (finalScore >= currentNode.getThreshold()) {
|
||||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
|
||||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
|
||||||
nextNodeName = currentNode.getPositive();
|
nextNodeName = currentNode.getPositive();
|
||||||
} else {
|
} else {
|
||||||
nextNodeName = currentNode.getNegative();
|
nextNodeName = currentNode.getNegative();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
|
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
|
||||||
|
|
||||||
treeStats.setResult(MatchType.parse(nextNodeName));
|
treeStats.setResult(MatchType.parse(nextNodeName));
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.mongodb.connection.Cluster;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
@ -177,41 +178,16 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testKeywordsClustering() {
|
public void legalnameClustering() {
|
||||||
|
|
||||||
final ClusteringFunction cf = new KeywordsClustering(params);
|
final ClusteringFunction cf = new LegalnameClustering(params);
|
||||||
final String s = "Polytechnic University of Turin";
|
String s = "key::1 key::2 city::1";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
final String s1 = "POLITECNICO DI TORINO";
|
s = "key::1 key::2 city::1 city::2";
|
||||||
System.out.println(s1);
|
System.out.println(s);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
|
||||||
System.out.println("s2 = " + s2);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
|
|
||||||
|
|
||||||
final String s3 = "universita universita milano milano";
|
|
||||||
System.out.println("s3 = " + s3);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
|
|
||||||
|
|
||||||
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
|
||||||
System.out.println("s4 = " + s4);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
|
|
||||||
|
|
||||||
final String s5 = "İstanbul Ticarət Universiteti";
|
|
||||||
System.out.println("s5 = " + s5);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
|
|
||||||
|
|
||||||
final String s6 = "National and Kapodistrian University of Athens";
|
|
||||||
System.out.println("s6 = " + s6);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
|
|
||||||
|
|
||||||
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
|
|
||||||
System.out.println("s7 = " + s7);
|
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -54,4 +54,47 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
|
||||||
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void countryInferenceTest() {
|
||||||
|
assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
|
||||||
|
assertEquals("UK", countryInference("UK", "Università di Bologna"));
|
||||||
|
assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
|
||||||
|
assertEquals("UNKNOWN", countryInference("UNKNOWN", "Università del Lavoro"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void cityInferenceTest() {
|
||||||
|
assertEquals("universita city::3181928", cityInference("Università di Bologna"));
|
||||||
|
assertEquals("university city::3170647", cityInference("University of Pisa"));
|
||||||
|
assertEquals("universita", cityInference("Università del lavoro"));
|
||||||
|
assertEquals("universita city::3173331 city::3169522", cityInference("Università di Modena e Reggio Emilia"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void keywordInferenceTest() {
|
||||||
|
assertEquals("key::41 turin", keywordInference("Polytechnic University of Turin"));
|
||||||
|
assertEquals("key::41 torino", keywordInference("POLITECNICO DI TORINO"));
|
||||||
|
assertEquals(
|
||||||
|
"key::1 key::60 key::81 milano bergamo",
|
||||||
|
keywordInference("Universita farmaceutica culturale di milano bergamo"));
|
||||||
|
assertEquals("key::1 key::1 milano milano", keywordInference("universita universita milano milano"));
|
||||||
|
assertEquals(
|
||||||
|
"key::10 kapodistriako panepistemio athenon",
|
||||||
|
keywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void cityKeywordInferenceTest() {
|
||||||
|
assertEquals("key::41 city::3165524", cityKeywordInference("Polytechnic University of Turin"));
|
||||||
|
assertEquals("key::41 city::3165524", cityKeywordInference("POLITECNICO DI TORINO"));
|
||||||
|
assertEquals(
|
||||||
|
"key::1 key::60 key::81 city::3173435 city::3182164",
|
||||||
|
cityKeywordInference("Universita farmaceutica culturale di milano bergamo"));
|
||||||
|
assertEquals(
|
||||||
|
"key::1 key::1 city::3173435 city::3173435", cityKeywordInference("universita universita milano milano"));
|
||||||
|
assertEquals(
|
||||||
|
"key::10 kapodistriako panepistemio city::264371",
|
||||||
|
cityKeywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,6 +35,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
params.put("name_th", "0.95");
|
params.put("name_th", "0.95");
|
||||||
params.put("jpath_value", "$.value");
|
params.put("jpath_value", "$.value");
|
||||||
params.put("jpath_classid", "$.qualifier.classid");
|
params.put("jpath_classid", "$.qualifier.classid");
|
||||||
|
params.put("codeRegex", "key::\\d+");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -44,52 +45,23 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void cityMatchTest() {
|
public void codeMatchTest() {
|
||||||
final CityMatch cityMatch = new CityMatch(params);
|
CodeMatch codeMatch = new CodeMatch(params);
|
||||||
|
|
||||||
// both names with no cities
|
// both names with no codes
|
||||||
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
|
assertEquals(1.0, codeMatch.distance("testing1", "testing2", conf));
|
||||||
|
|
||||||
// one of the two names with no cities
|
// one of the two names with no codes
|
||||||
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
|
assertEquals(-1.0, codeMatch.distance("testing1 key::1", "testing", conf));
|
||||||
|
|
||||||
// both names with cities (same)
|
// both names with codes (same)
|
||||||
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
|
assertEquals(1.0, codeMatch.distance("testing1 key::1", "testing2 key::1", conf));
|
||||||
|
|
||||||
// both names with cities (different)
|
// both names with codes (different)
|
||||||
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
|
assertEquals(0.0, codeMatch.distance("testing1 key::1", "testing2 key::2", conf));
|
||||||
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
|
|
||||||
|
|
||||||
// particular cases
|
// both names with codes (1 same, 1 different)
|
||||||
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
|
assertEquals(0.5,codeMatch.distance("key::1 key::2 testing1", "key::1 testing", conf));
|
||||||
assertEquals(
|
|
||||||
1.0,
|
|
||||||
cityMatch
|
|
||||||
.distance(
|
|
||||||
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
|
|
||||||
conf));
|
|
||||||
|
|
||||||
// failing becasuse 'Allen' is a transliterrated greek stopword
|
|
||||||
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
|
||||||
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void keywordMatchTest() {
|
|
||||||
params.put("threshold", "0.5");
|
|
||||||
|
|
||||||
final KeywordMatch keywordMatch = new KeywordMatch(params);
|
|
||||||
|
|
||||||
assertEquals(
|
|
||||||
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
|
|
||||||
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
|
|
||||||
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
|
|
||||||
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
|
||||||
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
|
|
||||||
assertEquals(2.0 / 3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
|
|
||||||
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
|
|
||||||
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
|
|
||||||
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -155,15 +127,15 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void jaroWinklerNormalizedNameTest() {
|
public void jaroWinklerLegalnameTest() {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerLegalname jaroWinklerLegalname = new JaroWinklerLegalname(params);
|
||||||
|
|
||||||
double result = jaroWinklerNormalizedName
|
double result = jaroWinklerLegalname
|
||||||
.distance("AT&T (United States)", "United States Military Academy", conf);
|
.distance("AT&T (United States)", "United States key::2 key::1", conf);
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
|
|
||||||
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
result = jaroWinklerLegalname.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -344,13 +316,13 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
|
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
|
||||||
assertEquals(-1.0, result);
|
assertEquals(-1.0, result);
|
||||||
|
|
||||||
result = countryMatch.distance("CHILE", "UNKNOWN", conf);
|
result = countryMatch.distance("CL", "UNKNOWN", conf);
|
||||||
assertEquals(-1.0, result);
|
assertEquals(-1.0, result);
|
||||||
|
|
||||||
result = countryMatch.distance("CHILE", "ITALY", conf);
|
result = countryMatch.distance("CL", "IT", conf);
|
||||||
assertEquals(0.0, result);
|
assertEquals(0.0, result);
|
||||||
|
|
||||||
result = countryMatch.distance("CHILE", "CHILE", conf);
|
result = countryMatch.distance("CL", "CL", conf);
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -190,7 +190,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("orp_simrel = " + orp_simrel);
|
System.out.println("orp_simrel = " + orp_simrel);
|
||||||
|
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(751, orgs_simrel);
|
assertEquals(742, orgs_simrel);
|
||||||
assertEquals(566, pubs_simrel);
|
assertEquals(566, pubs_simrel);
|
||||||
assertEquals(113, sw_simrel);
|
assertEquals(113, sw_simrel);
|
||||||
assertEquals(148, ds_simrel);
|
assertEquals(148, ds_simrel);
|
||||||
|
@ -251,7 +251,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(751, orgs_simrel);
|
assertEquals(742, orgs_simrel);
|
||||||
assertEquals(566, pubs_simrel);
|
assertEquals(566, pubs_simrel);
|
||||||
assertEquals(148, ds_simrel);
|
assertEquals(148, ds_simrel);
|
||||||
assertEquals(280, orp_simrel);
|
assertEquals(280, orp_simrel);
|
||||||
|
@ -442,7 +442,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
final List<Relation> merges = pubs
|
final List<Relation> merges = pubs
|
||||||
.filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
.filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
||||||
.collectAsList();
|
.collectAsList();
|
||||||
assertEquals(3, merges.size());
|
assertEquals(1, merges.size());
|
||||||
Set<String> dups = Sets
|
Set<String> dups = Sets
|
||||||
.newHashSet(
|
.newHashSet(
|
||||||
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
|
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
|
||||||
|
@ -451,7 +451,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
merges.forEach(r -> {
|
merges.forEach(r -> {
|
||||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||||
assertEquals(ModelConstants.MERGES, r.getRelClass());
|
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||||
assertTrue(dups.contains(r.getTarget()));
|
assertTrue(dups.contains(r.getTarget()));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -561,7 +561,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("orp_mergerel = " + orp_mergerel);
|
System.out.println("orp_mergerel = " + orp_mergerel);
|
||||||
|
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(1268, orgs_mergerel);
|
assertEquals(1278, orgs_mergerel);
|
||||||
assertEquals(1156, pubs.count());
|
assertEquals(1156, pubs.count());
|
||||||
assertEquals(292, sw_mergerel);
|
assertEquals(292, sw_mergerel);
|
||||||
assertEquals(476, ds_mergerel);
|
assertEquals(476, ds_mergerel);
|
||||||
|
@ -618,7 +618,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("orp_deduprecord = " + orp_deduprecord);
|
System.out.println("orp_deduprecord = " + orp_deduprecord);
|
||||||
|
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(86, orgs_deduprecord);
|
assertEquals(78, orgs_deduprecord);
|
||||||
assertEquals(96, pubs.count());
|
assertEquals(96, pubs.count());
|
||||||
assertEquals(47, sw_deduprecord);
|
assertEquals(47, sw_deduprecord);
|
||||||
assertEquals(97, ds_deduprecord);
|
assertEquals(97, ds_deduprecord);
|
||||||
|
@ -761,7 +761,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
if (CHECK_CARDINALITIES) {
|
if (CHECK_CARDINALITIES) {
|
||||||
assertEquals(930, publications);
|
assertEquals(930, publications);
|
||||||
assertEquals(839, organizations);
|
assertEquals(831, organizations);
|
||||||
assertEquals(100, projects);
|
assertEquals(100, projects);
|
||||||
assertEquals(100, datasource);
|
assertEquals(100, datasource);
|
||||||
assertEquals(196, softwares);
|
assertEquals(196, softwares);
|
||||||
|
|
|
@ -22,8 +22,11 @@ import java.util.Properties;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.Row;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.*;
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
@ -143,7 +146,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(86, orgs_simrel);
|
assertEquals(92, orgs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -172,7 +175,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(122, orgs_simrel);
|
assertEquals(128, orgs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -207,7 +210,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
||||||
.count();
|
.count();
|
||||||
assertEquals(132, orgs_mergerel);
|
assertEquals(128, orgs_mergerel);
|
||||||
|
|
||||||
// verify that a DiffRel is in the mergerels (to be sure that the job supposed to remove them has something to
|
// verify that a DiffRel is in the mergerels (to be sure that the job supposed to remove them has something to
|
||||||
// do)
|
// do)
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.dedup.jpath;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.oa.dedup.SparkOpenorgsDedupTest;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.sql.Row;
|
import org.apache.spark.sql.Row;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
@ -24,6 +25,30 @@ class JsonPathTest {
|
||||||
|
|
||||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
|
||||||
|
System.out.println("row = " + row);
|
||||||
|
Assertions.assertNotNull(row);
|
||||||
|
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||||
|
|
||||||
|
System.out.println("row = " + row.getAs("countrytitle"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void jsonToModelTest() throws IOException{
|
||||||
|
DedupConfig conf = DedupConfig
|
||||||
|
.load(IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkOpenorgsDedupTest.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||||
|
|
||||||
|
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||||
|
|
||||||
|
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
// to check that the same parsing returns the same row
|
||||||
|
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
||||||
|
|
||||||
|
Assertions.assertEquals(row, row1);
|
||||||
|
System.out.println("row = " + row);
|
||||||
Assertions.assertNotNull(row);
|
Assertions.assertNotNull(row);
|
||||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,8 +4,8 @@
|
||||||
"dedupRun" : "001",
|
"dedupRun" : "001",
|
||||||
"entityType" : "organization",
|
"entityType" : "organization",
|
||||||
"subEntityValue": "organization",
|
"subEntityValue": "organization",
|
||||||
"orderField" : "legalname",
|
"orderField" : "original_legalname",
|
||||||
"queueMaxSize" : "2000",
|
"queueMaxSize" : "100000",
|
||||||
"groupMaxSize" : "50",
|
"groupMaxSize" : "50",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
"idPath":"$.id",
|
"idPath":"$.id",
|
||||||
|
@ -15,10 +15,10 @@
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
{ "name" : "sortedngrampairs", "fields" : [ "original_legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
{ "name" : "suffixprefix", "fields" : [ "original_legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
{ "name" : "legalnameclustering", "fields" : [ "legalname" ], "params" : { "max": 2} }
|
||||||
],
|
],
|
||||||
"decisionTree" : {
|
"decisionTree" : {
|
||||||
"start": {
|
"start": {
|
||||||
|
@ -29,16 +29,23 @@
|
||||||
"weight": 1,
|
"weight": 1,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "false",
|
||||||
"params": {}
|
"params": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "rorid",
|
||||||
|
"comparator": "exactMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 1,
|
"threshold": 1,
|
||||||
"aggregation": "AVG",
|
"aggregation": "OR",
|
||||||
"positive": "MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "layer2",
|
"undefined": "necessaryConditions",
|
||||||
"ignoreUndefined": "false"
|
"ignoreUndefined": "false"
|
||||||
},
|
},
|
||||||
"layer2": {
|
"necessaryConditions": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "websiteurl",
|
"field": "websiteurl",
|
||||||
|
@ -55,14 +62,14 @@
|
||||||
"params": {}
|
"params": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "original_legalname",
|
||||||
"comparator": "numbersMatch",
|
"comparator": "numbersMatch",
|
||||||
"weight": 1,
|
"weight": 1,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {}
|
"params": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "original_legalname",
|
||||||
"comparator": "romansMatch",
|
"comparator": "romansMatch",
|
||||||
"weight": 1,
|
"weight": 1,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
|
@ -71,68 +78,64 @@
|
||||||
],
|
],
|
||||||
"threshold": 1,
|
"threshold": 1,
|
||||||
"aggregation": "AND",
|
"aggregation": "AND",
|
||||||
"positive": "layer3",
|
"positive": "cityCheck",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "layer3",
|
"undefined": "cityCheck",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"layer3": {
|
"cityCheck": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "legalname",
|
||||||
"comparator": "cityMatch",
|
"comparator": "codeMatch",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {
|
"params": {
|
||||||
"windowSize": "4"
|
"codeRegex": "city::\\d+"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.1,
|
"threshold": 0.1,
|
||||||
"aggregation": "AVG",
|
"aggregation": "AVG",
|
||||||
"positive": "layer4",
|
"positive": "keywordCheck",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "NO_MATCH",
|
"undefined": "NO_MATCH",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"layer4": {
|
"keywordCheck": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "legalname",
|
||||||
"comparator": "keywordMatch",
|
"comparator": "codeMatch",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {
|
"params": {
|
||||||
"windowSize": "4"
|
"codeRegex": "key::\\d+"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.7,
|
"threshold": 0.7,
|
||||||
"aggregation": "AVG",
|
"aggregation": "AVG",
|
||||||
"positive": "layer5",
|
"positive": "nameCheck",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "layer5",
|
"undefined": "nameCheck",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"layer5": {
|
"nameCheck": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "legalname",
|
||||||
"comparator": "jaroWinklerNormalizedName",
|
"comparator": "jaroWinklerLegalname",
|
||||||
"weight": 0.9,
|
"weight": 0.9,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {
|
"params": {}
|
||||||
"windowSize": "4"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "legalshortname",
|
"field": "legalshortname",
|
||||||
"comparator": "jaroWinklerNormalizedName",
|
"comparator": "jaroWinklerLegalname",
|
||||||
"weight": 0.1,
|
"weight": 0.1,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "false",
|
||||||
"params": {
|
"params": {}
|
||||||
"windowSize": 4
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.9,
|
"threshold": 0.9,
|
||||||
|
@ -144,126 +147,16 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"model" : [
|
"model" : [
|
||||||
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
|
{ "name" : "country", "type" : "String", "path" : "$.country.classid", "infer" : "country", "inferenceFrom" : "$.legalname.value"},
|
||||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
|
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value", "infer" : "city_keyword"},
|
||||||
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
|
{ "name" : "original_legalname", "type" : "String", "path" : "$.legalname.value" },
|
||||||
|
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value", "infer" : "city_keyword"},
|
||||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
||||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
|
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
|
||||||
|
{ "name" : "rorid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='ROR')].value"},
|
||||||
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
||||||
],
|
],
|
||||||
"blacklists" : {
|
"blacklists" : {},
|
||||||
"legalname" : []
|
"synonyms": {}
|
||||||
},
|
|
||||||
"synonyms": {
|
|
||||||
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
|
||||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
|
||||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
|
||||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
|
||||||
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
|
|
||||||
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
|
|
||||||
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
|
|
||||||
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
|
||||||
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
|
||||||
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
|
||||||
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
|
||||||
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
|
||||||
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
|
||||||
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
|
|
||||||
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
|
|
||||||
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
|
|
||||||
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
|
|
||||||
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
|
|
||||||
"key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],
|
|
||||||
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
|
|
||||||
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
|
|
||||||
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
|
|
||||||
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
|
|
||||||
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
|
|
||||||
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
|
|
||||||
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
|
|
||||||
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
|
|
||||||
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
|
|
||||||
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
|
|
||||||
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
|
|
||||||
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
|
|
||||||
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
|
|
||||||
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
|
|
||||||
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
|
|
||||||
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
|
|
||||||
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
|
|
||||||
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
|
|
||||||
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
|
|
||||||
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
|
|
||||||
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
|
|
||||||
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
|
|
||||||
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
|
|
||||||
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
|
|
||||||
"key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
|
|
||||||
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
|
|
||||||
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
|
|
||||||
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
|
|
||||||
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
|
|
||||||
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
|
|
||||||
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
|
|
||||||
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
|
|
||||||
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
|
|
||||||
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
|
|
||||||
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
|
|
||||||
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
|
|
||||||
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
|
|
||||||
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
|
|
||||||
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
|
|
||||||
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
|
|
||||||
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
|
|
||||||
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
|
|
||||||
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
|
|
||||||
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
|
|
||||||
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
|
|
||||||
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
|
|
||||||
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
|
|
||||||
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
|
|
||||||
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
|
|
||||||
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
|
|
||||||
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
|
|
||||||
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
|
|
||||||
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
|
|
||||||
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
|
|
||||||
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
|
|
||||||
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
|
|
||||||
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
|
|
||||||
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
|
|
||||||
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
|
|
||||||
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
|
|
||||||
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
|
|
||||||
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
|
|
||||||
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
|
|
||||||
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
|
|
||||||
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
|
|
||||||
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
|
|
||||||
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
|
|
||||||
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
|
|
||||||
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
|
|
||||||
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
|
|
||||||
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
|
|
||||||
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
|
|
||||||
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
|
|
||||||
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
|
|
||||||
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
|
|
||||||
"key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
|
|
||||||
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
|
|
||||||
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
|
|
||||||
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
|
|
||||||
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
|
|
||||||
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
|
|
||||||
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
|
|
||||||
"key::102": ["informatics","informatica","informática","informática","informatica",""],
|
|
||||||
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
|
||||||
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
|
|
||||||
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
|
|
||||||
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
|
|
||||||
"key::107" : ["agricultural forestry", "af", "a f"],
|
|
||||||
"key::108" : ["agricultural mechanical", "am", "a m"],
|
|
||||||
"key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue