1
0
Fork 0

Restyling of the JaroWinklerNormalizedName comparator, which is now optimized. Added some translations to the translation maps, and added a clustering function based on keywords in organization legal names.

This commit is contained in:
miconis 2019-07-19 17:10:29 +02:00
parent fb5e38db26
commit a85576c27e
7 changed files with 87 additions and 112 deletions

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import com.google.common.base.Joiner;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import java.util.*; import java.util.*;
@ -20,14 +19,14 @@ public class KeywordsClustering extends AbstractClusteringFunction {
protected Collection<String> doApply(String s) { protected Collection<String> doApply(String s) {
//takes city codes and keywords codes without duplicates //takes city codes and keywords codes without duplicates
Set<String> keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4)); Set<String> keywords = getKeywords(s, params.getOrDefault("windowSize", 4));
Set<String> cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4)); Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
//list of combination to return as result //list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>(); final Collection<String> combinations = new LinkedHashSet<String>();
for (String keyword: keywords){ for (String keyword: keywordsToCodes(keywords)){
for (String city: cities) { for (String city: citiesToCodes(cities)) {
combinations.add(keyword+"-"+city); combinations.add(keyword+"-"+city);
if (combinations.size()>=params.getOrDefault("max", 2)) { if (combinations.size()>=params.getOrDefault("max", 2)) {
return combinations; return combinations;

View File

@ -18,6 +18,8 @@ import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/** /**
* Set of common functions * Set of common functions
@ -27,6 +29,10 @@ import java.util.regex.Pattern;
*/ */
public abstract class AbstractPaceFunctions { public abstract class AbstractPaceFunctions {
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
@ -212,99 +218,58 @@ public abstract class AbstractPaceFunctions {
return sb.toString().trim(); return sb.toString().trim();
} }
public String keywordsToCode(String s1, Map<String, String> translationMap, int windowSize){ public String removeKeywords(String s, Set<String> keywords) {
List<String> tokens = Arrays.asList(s1.split(" ")); s = " " + s + " ";
for (String k: keywords ) {
if (tokens.size()<windowSize) s = s.replaceAll(k.toLowerCase(), "");
windowSize = tokens.size();
int length = windowSize;
while (length != 0) {
for (int i = 0; i<=tokens.size()-length; i++){
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
if (translationMap.containsKey(candidate)) {
s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + translationMap.get(candidate) + " ");
}
}
length-=1;
} }
return s1; return s.trim();
} }
public String removeCodes(String s) { public double keywordsCompare(Set<String> s1, Set<String> s2){
final String regexKey = "\\bkey::[0-9]*\\b";
final String regexCity = "\\bcity::[0-9]*\\b";
return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim();
}
public double keywordsCompare(String s1, String s2){ Set<String> k1 = keywordsToCodes(s1);
Set<String> k2 = keywordsToCodes(s2);
List<String> keywords1 = getKeywords(s1); int longer = (k1.size()>k2.size())?k1.size():k2.size();
List<String> keywords2 = getKeywords(s2);
int longer = (keywords1.size()>keywords2.size())?keywords1.size():keywords2.size();
if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty()) if (k1.isEmpty() || k2.isEmpty())
return 1.0; return 1.0;
else else
return (double)CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()/(double)longer; return (double)CollectionUtils.intersection(k1,k2).size()/(double)longer;
}
//check if 2 strings have same keywords
public boolean sameKeywords(String s1, String s2){
//at least 1 keyword in common
if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty())
return true;
else
return CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()>0;
} }
//returns true if at least 1 city is in common //returns true if at least 1 city is in common
//returns true if a name has no cities //returns true if no cities are contained in names
public boolean sameCity(String s1, String s2){ //returns false if one of the two names have no city
public boolean sameCity(Set<String> s1, Set<String> s2){
if (getCities(s1).isEmpty() || getCities(s2).isEmpty()) Set<String> c1 = citiesToCodes(s1);
Set<String> c2 = citiesToCodes(s2);
if (c1.isEmpty() && c2.isEmpty())
return true; return true;
else else {
return CollectionUtils.intersection(getCities(s1), getCities(s2)).size()>0; if (c1.isEmpty() ^ c2.isEmpty())
} return false;
return CollectionUtils.intersection(c1, c2).size() > 0;
//get the list of keywords in a string
public List<String> getCities(String s) {
final String regex = "\\bcity::[0-9]*\\b";
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
Matcher m = p.matcher(s);
List<String> codes = new ArrayList<>();
while (m.find()) {
codes.add(m.group(0));
for (int i = 1; i <= m.groupCount(); i++) {
codes.add(m.group(0));
} }
} }
return codes;
//convert the set of keywords to codes
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
} }
//get the list of keywords in a string public Set<String> keywordsToCodes(Set<String> keywords) {
public List<String> getKeywords(String s) { return toCodes(keywords, translationMap);
final String regex = "\\bkey::[0-9]*\\b";
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
Matcher m = p.matcher(s);
List<String> codes = new ArrayList<>();
while (m.find()) {
codes.add(m.group(0));
for (int i = 1; i <= m.groupCount(); i++) {
codes.add(m.group(0));
} }
}
return codes; public Set<String> citiesToCodes(Set<String> keywords) {
return toCodes(keywords, cityMap);
} }
protected String firstLC(final String s) { protected String firstLC(final String s) {
@ -320,7 +285,7 @@ public abstract class AbstractPaceFunctions {
} }
//get the list of codes into the input string //get the list of codes into the input string
public Set<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){ public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize){
String s = cleanup(s1); String s = cleanup(s1);
@ -340,7 +305,7 @@ public abstract class AbstractPaceFunctions {
for (int i = 0; i<=tokens.size()-length; i++){ for (int i = 0; i<=tokens.size()-length; i++){
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length)); String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
if (translationMap.containsKey(candidate)) { if (translationMap.containsKey(candidate)) {
codes.add(translationMap.get(candidate)); codes.add(candidate);
s = s.replace(candidate, ""); s = s.replace(candidate, "");
} }
} }
@ -352,4 +317,12 @@ public abstract class AbstractPaceFunctions {
return codes; return codes;
} }
public Set<String> getKeywords(String s1, int windowSize) {
return getKeywords(s1, translationMap, windowSize);
}
public Set<String> getCities(String s1, int windowSize) {
return getKeywords(s1, cityMap, windowSize);
}
} }

View File

@ -11,18 +11,6 @@ import java.util.Set;
@DistanceClass("JaroWinklerNormalizedName") @DistanceClass("JaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
private static Set<String> stopwordsEn = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
private static Set<String> stopwordsIt = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
private static Set<String> stopwordsDe = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
private static Set<String> stopwordsFr = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
private static Set<String> stopwordsPt = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
private static Set<String> stopwordsEs = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
//key=word, value=global identifier => example: "università"->"university", used to substitute the word with the global identifier
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
private Map<String, Number> params; private Map<String, Number> params;
public JaroWinklerNormalizedName(Map<String, Number> params){ public JaroWinklerNormalizedName(Map<String, Number> params){
@ -49,28 +37,30 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
ca = filterAllStopWords(ca); ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb); cb = filterAllStopWords(cb);
//replace keywords with codes Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue()); Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
String codesB = keywordsToCode(cb, translationMap, params.getOrDefault("windowSize",4).intValue());
//replace cities with codes Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
codesA = keywordsToCode(codesA, cityMap, params.getOrDefault("windowSize", 4).intValue()); Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
codesB = keywordsToCode(codesB, cityMap, params.getOrDefault("windowSize", 4).intValue());
if (sameCity(cities1,cities2)) {
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1);
cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2);
//if two names have same city
if (sameCity(codesA,codesB)){
if (keywordsCompare(codesA, codesB)>params.getOrDefault("threshold", 0.5).doubleValue()) {
ca = removeCodes(codesA);
cb = removeCodes(codesB);
if (ca.isEmpty() && cb.isEmpty()) if (ca.isEmpty() && cb.isEmpty())
return 1.0; return 1.0;
else else
return normalize(ssalgo.score(ca,cb)); return normalize(ssalgo.score(ca,cb));
} }
} }
return 0.0; return 0.0;
} }
@Override @Override

View File

@ -6192,7 +6192,7 @@ city::753142;Zoliborz;Zalborz;Zalbórz;
city::753866;Zamosc;Gorad Zamasc';Zamosc;Zamosc';Zamosca;Zamoscia;Zamose;Zamoshc;Zamoshch;Zamost'ye;Zamoste;Zamostja;Zamosts;Zamostė;Zamostye;Zamość;jamosichi;sa mxchch;zamoshichi;zamostsi;zha mo xi qi;zmwsz;Ζάμοστς;Горад Замасць;Замостя;Замосць;Замошч;Замошћ;Զամոշչ;זמושץ;ซามอชช์;ზამოსცი;ザモシチ;扎莫希奇;자모시치; city::753866;Zamosc;Gorad Zamasc';Zamosc;Zamosc';Zamosca;Zamoscia;Zamose;Zamoshc;Zamoshch;Zamost'ye;Zamoste;Zamostja;Zamosts;Zamostė;Zamostye;Zamość;jamosichi;sa mxchch;zamoshichi;zamostsi;zha mo xi qi;zmwsz;Ζάμοστς;Горад Замасць;Замостя;Замосць;Замошч;Замошћ;Զամոշչ;זמושץ;ซามอชช์;ზამოსცი;ザモシチ;扎莫希奇;자모시치;
city::755330;Wola;Volja;Воля; city::755330;Wola;Volja;Воля;
city::756092;Wawer;; city::756092;Wawer;;
city::756135;Warsaw;Barsobia;Varsa;Varsava;Varsavia;Varsavja;Varshava;Varshavae;Varsja;Varsjá;Varso;Varsova;Varsovia;Varsovia - Warszawa;Varsovie;Varsovio;Varssavi;Varsuva;Varsòvia;Varsó;Varsóvia;Varşova;Varšava;Varšuva;Varșovia;Vársá;WAW;Warsaw;Warsawa;Warschau;Warskou;Warszaw;Warszawa;Waršawa;baleusyaba;hua sha;varshava;vorso;warsw;warushawa;wrsh;wrshw;wrsw;wxrsx;Βαρσοβία;Варшавæ;Варшава;Վարշավա;ווארשע;ורשה;װאַרשע;وارسو;ورشو;ۋارشاۋا;ܘܪܣܘ;वॉर्सो;วอร์ซอ;ვარშავა;ዋርሶው;ワルシャワ;华沙;華沙;바르샤바; city::756135;Warsaw;warszawie;Barsobia;Varsa;Varsava;Varsavia;Varsavja;Varshava;Varshavae;Varsja;Varsjá;Varso;Varsova;Varsovia;Varsovia - Warszawa;Varsovie;Varsovio;Varssavi;Varsuva;Varsòvia;Varsó;Varsóvia;Varşova;Varšava;Varšuva;Varșovia;Vársá;WAW;Warsaw;Warsawa;Warschau;Warskou;Warszaw;Warszawa;Waršawa;baleusyaba;hua sha;varshava;vorso;warsw;warushawa;wrsh;wrshw;wrsw;wxrsx;Βαρσοβία;Варшавæ;Варшава;Վարշավա;ווארשע;ורשה;װאַרשע;وارسو;ورشو;ۋارشاۋا;ܘܪܣܘ;वॉर्सो;วอร์ซอ;ვარშავა;ዋርሶው;ワルシャワ;华沙;華沙;바르샤바;warszawskiej;warszawska;
city::756867;Tomaszow Mazowiecki;Mazovijos Tomasuvas;Mazovijos Tomašuvas;Thomasovia;Tomashov Mazovecki;Tomashov Mazovjecki;Tomashuv-Mazovec'kij;Tomashuv-Mazovecki;Tomashuv-Mazoveckij;Tomasova Mazovecka;Tomaszow;Tomaszow Mazowiecki;Tomaszów;Tomaszów Mazowiecki;Tomašova Mazovecka;ma zuo fu she de qu tuo ma shu fu;twmswb mzwbyyzqy;tx ma chuf ma sx weiyt ski;Томашов Мазовецки;Томашов Мазовјецки;Томашув-Мазовецки;Томашув-Мазовецкий;Томашув-Мазовецький;טומשוב מזובייצקי;ตอมาชูฟมาซอเวียตสกี;馬佐夫舍地區托馬舒夫; city::756867;Tomaszow Mazowiecki;Mazovijos Tomasuvas;Mazovijos Tomašuvas;Thomasovia;Tomashov Mazovecki;Tomashov Mazovjecki;Tomashuv-Mazovec'kij;Tomashuv-Mazovecki;Tomashuv-Mazoveckij;Tomasova Mazovecka;Tomaszow;Tomaszow Mazowiecki;Tomaszów;Tomaszów Mazowiecki;Tomašova Mazovecka;ma zuo fu she de qu tuo ma shu fu;twmswb mzwbyyzqy;tx ma chuf ma sx weiyt ski;Томашов Мазовецки;Томашов Мазовјецки;Томашув-Мазовецки;Томашув-Мазовецкий;Томашув-Мазовецький;טומשוב מזובייצקי;ตอมาชูฟมาซอเวียตสกี;馬佐夫舍地區托馬舒夫;
city::757026;Tarnow;Gorad Tarnuu;Tarnov;Tarnova;Tarnow;Tarnuv;Tarnuvas;Tarnuw;Tarnów;Tarnůw;ta er nu fu;taleunupeu;tarnwf;tarunufu;trnwb;Горад Тарнуў;Тарнов;Тарнув;טארנע;טרנוב;تارنوف;タルヌフ;塔爾努夫;타르누프; city::757026;Tarnow;Gorad Tarnuu;Tarnov;Tarnova;Tarnow;Tarnuv;Tarnuvas;Tarnuw;Tarnów;Tarnůw;ta er nu fu;taleunupeu;tarnwf;tarunufu;trnwb;Горад Тарнуў;Тарнов;Тарнув;טארנע;טרנוב;تارنوف;タルヌフ;塔爾努夫;타르누프;
city::757033;Tarnobrzeg;Gorad Tarnobzhag;Nova Tarnovia;QEP;Tarnobjeg;Tarnobrzeg;Tarnobrzyg;Tarnobzega;Tarnobzegas;Tarnobzheg;Tarnobzhege;Tarnobžega;Tarnobžegas;ta er nuo bu re ge;taleunobeujekeu;tarnwbzk;tarunobujeku;trnwbzg;Горад Тарнобжаг;Тарнобжег;Тарнобжеге;דזשיקאוו;טרנובזג;تارنوبزک;タルノブジェク;塔尔诺布热格;타르노브제크; city::757033;Tarnobrzeg;Gorad Tarnobzhag;Nova Tarnovia;QEP;Tarnobjeg;Tarnobrzeg;Tarnobrzyg;Tarnobzega;Tarnobzegas;Tarnobzheg;Tarnobzhege;Tarnobžega;Tarnobžegas;ta er nuo bu re ge;taleunobeujekeu;tarnwbzk;tarunobujeku;trnwbzg;Горад Тарнобжаг;Тарнобжег;Тарнобжеге;דזשיקאוו;טרנובזג;تارنوبزک;タルノブジェク;塔尔诺布热格;타르노브제크;
@ -6229,7 +6229,7 @@ city::3080165;Zielona Gora;Gorad Zjaljona-Gura;Gruentberg;Grunberg;Grünberg;IEG
city::3080251;Zgierz;Gorad Zgezh;Zgeza;Zgezas;Zgezh;Zgeža;Zgežas;Zgierz;Zgjezh;Znkies;ci gai ri;jeugiesi;sex keiyr ch;zgyyz;zugyeshi;Ζγκιες;Горад Згеж;Згеж;Згјеж;זגייז;เซอเกียร์ช;ズギェシ;兹盖日;즈기에시; city::3080251;Zgierz;Gorad Zgezh;Zgeza;Zgezas;Zgezh;Zgeža;Zgežas;Zgierz;Zgjezh;Znkies;ci gai ri;jeugiesi;sex keiyr ch;zgyyz;zugyeshi;Ζγκιες;Горад Згеж;Згеж;Згјеж;זגייז;เซอเกียร์ช;ズギェシ;兹盖日;즈기에시;
city::3080526;Zawiercie;Zaverce;Zaverche;Zavercis;Zavertse;Zaviercis;Zavjerce;Zawiercie;zha wei er qie;zwwyyrzh;Заверце;Заверче;Завјерће;זוויירצה;扎维尔切; city::3080526;Zawiercie;Zaverce;Zaverche;Zavercis;Zavertse;Zaviercis;Zavjerce;Zawiercie;zha wei er qie;zwwyyrzh;Заверце;Заверче;Завјерће;זוויירצה;扎维尔切;
city::3080985;Zabrze;Gorad Zabzheh;Hindenburg;Zabje;Zaborze;Zabrze;Zabzas;Zabze;Zabzhe;Zabžas;Zabže;Zobrze;jabeuje;sab che;zabjh;zabuje;zabzhh;zha bu re;zʼbzh;Горад Забжэ;Забже;Զաբժե;זאבזה;زابجه;زابژه;زبرزے;ซาบเช;ザブジェ;扎布热;자브제; city::3080985;Zabrze;Gorad Zabzheh;Hindenburg;Zabje;Zaborze;Zabrze;Zabzas;Zabze;Zabzhe;Zabžas;Zabže;Zobrze;jabeuje;sab che;zabjh;zabuje;zabzhh;zha bu re;zʼbzh;Горад Забжэ;Забже;Զաբժե;זאבזה;زابجه;زابژه;زبرزے;ซาบเช;ザブジェ;扎布热;자브제;
city::3081368;Wroclaw;Brassel;Breslau;Breslavia;Breslavl';Breslavl;Gorad Vroclau;Vratislav;Vratislavia;Vroclav;Vroclava;Vroclavas;Vroclavo;Vroklave;Vroslav;Vrotslav;WRO;Wroclaw;Wroclow;Wrocław;Wrocłow;Wroklaw;Wroslaw;Wrosław;Wrócław;beulocheuwapeu;frwtswaf;fu luo ci wa fu;viratscahp;vrotsavapha;vrotslavi;vurotsuwafu;w rxtswaf;wrwslaw;wrwtswaf;wrwzlb;Βρότσλαβ;Вроцлав;Горад Вроцлаў;ברעסלוי;ורוצלב;فروتسواف;وروتسواف;وروسلاو;ڤرۆتسواف;व्रोत्सवाफ;விராத்ஸ்சாஃப்;วรอตสวัฟ;ვროცლავი;ヴロツワフ;弗罗茨瓦夫;브로츠와프; city::3081368;Wroclaw;Brassel;Breslau;Breslavia;Breslavl';Breslavl;Gorad Vroclau;Vratislav;Vratislavia;Vroclav;Vroclava;Vroclavas;Vroclavo;Vroklave;Vroslav;Vrotslav;WRO;Wroclaw;Wroclow;Wrocław;Wrocłow;Wroklaw;Wroslaw;Wrosław;Wrócław;beulocheuwapeu;frwtswaf;fu luo ci wa fu;viratscahp;vrotsavapha;vrotslavi;vurotsuwafu;w rxtswaf;wrwslaw;wrwtswaf;wrwzlb;Βρότσλαβ;Вроцлав;Горад Вроцлаў;ברעסלוי;ורוצלב;فروتسواف;وروتسواف;وروسلاو;ڤرۆتسواف;व्रोत्सवाफ;விராத்ஸ்சாஃப்;วรอตสวัฟ;ვროცლავი;ヴロツワフ;弗罗茨瓦夫;브로츠와프;Wrocławska;
city::3081741;Wloclawek;Gorad Ulaclavak;Leslau;Vloclavek;Vloclaveka;Vloclavekas;Wloclawek;Włocławek;beuwocheuwabekeu;fu wo ci wa wei ke;fwtswafk;vuu~otsuwavu~eku;w wxts wa wek;Влоцлавек;Горад Улацлавак;ולוצלאווק;فوتسوافك;ووتس‌واوک;ววอตซวาเวก;ヴウォツワヴェク;弗沃茨瓦韦克;브워츠와베크; city::3081741;Wloclawek;Gorad Ulaclavak;Leslau;Vloclavek;Vloclaveka;Vloclavekas;Wloclawek;Włocławek;beuwocheuwabekeu;fu wo ci wa wei ke;fwtswafk;vuu~otsuwavu~eku;w wxts wa wek;Влоцлавек;Горад Улацлавак;ולוצלאווק;فوتسوافك;ووتس‌واوک;ววอตซวาเวก;ヴウォツワヴェク;弗沃茨瓦韦克;브워츠와베크;
city::3082707;Walbrzych;Gorad Valbzhykh;Valbzhikh;Valbziha;Valbzychas;Valbžiha;Valbžychas;Walbrzych;Waldenburg;Waldenburg in Schlesien;Wałbrzych;baubeujiheu;vu~aubujifu;wa lb cik;wa wu bu ri he;wawb jykh;wawb zhykh;wlbzyk;Валбжих;Горад Валбжых;ולבזיך;واوب جيخ;واوب ژیخ;วาลบ์จิก;ヴァウブジフ;瓦烏布日赫;바우브지흐; city::3082707;Walbrzych;Gorad Valbzhykh;Valbzhikh;Valbziha;Valbzychas;Valbžiha;Valbžychas;Walbrzych;Waldenburg;Waldenburg in Schlesien;Wałbrzych;baubeujiheu;vu~aubujifu;wa lb cik;wa wu bu ri he;wawb jykh;wawb zhykh;wlbzyk;Валбжих;Горад Валбжых;ולבזיך;واوب جيخ;واوب ژیخ;วาลบ์จิก;ヴァウブジフ;瓦烏布日赫;바우브지흐;
city::3082914;Tychy;Tichau;Tihi;Tikhi;Tychos;Tychy;Tykhy;Tıhı;di hei;thi khi;tihi;tixa;tyshy;Тихи;Тыхы;تیشی;ทิคี;ティヒ;蒂黑;티히; city::3082914;Tychy;Tichau;Tihi;Tikhi;Tychos;Tychy;Tykhy;Tıhı;di hei;thi khi;tihi;tixa;tyshy;Тихи;Тыхы;تیشی;ทิคี;ティヒ;蒂黑;티히;

Can't render this file because it is too large.

View File

@ -1,6 +1,6 @@
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza;
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology;
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
@ -47,13 +47,13 @@ key::46;division;divisione;division;отделение;divisie;τμήμα
key::47;committee;comitato;comité;комитет;commissie;επιτροπή key::47;committee;comitato;comité;комитет;commissie;επιτροπή
key::48;promotion;promozione;продвижение;proothisis;forderung key::48;promotion;promozione;продвижение;proothisis;forderung
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline; key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline; key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;
key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik; key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri; key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus; key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia; key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia;
key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik; key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik;
key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon; key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon;informatycznych;
key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne; key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne;
key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;; key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;;
key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri; key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri;
@ -71,7 +71,7 @@ key::70;veterinary;veterinaria;veterinarie;veterinaria;veterinária;tierärtzlic
key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia; key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia;
key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek; key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek;
key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia; key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia;
key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar; key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar;wojskowa;
key::75;theological;teologia;teologico;teológico;tecnológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;technológiai;teološki;teoloogia;usuteadus;teoloogiline; key::75;theological;teologia;teologico;teológico;tecnológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;technológiai;teološki;teoloogia;usuteadus;teoloogiline;
key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika; key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika;
key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus; key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus;

1 key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
2 key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
3 key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza;
4 key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
5 key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
6 key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
38 key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
39 key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
40 key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
41 key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology;
42 key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
43 key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
44 key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
47 key::47;committee;comitato;comité;комитет;commissie;επιτροπή
48 key::48;promotion;promozione;продвижение;proothisis;forderung
49 key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
50 key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline; key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;
51 key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
52 key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
53 key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
54 key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia;
55 key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik;
56 key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon; key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon;informatycznych;
57 key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne;
58 key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskkonna;
59 key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte;äri;
71 key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia;
72 key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek;
73 key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia;
74 key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar; key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar;wojskowa;
75 key::75;theological;teologia;teologico;teológico;teológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;teológiai;teološki;teoloogia;usuteadus;teoloogiline;
76 key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika;
77 key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus;

View File

@ -137,6 +137,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println("s3 = " + s3); System.out.println("s3 = " + s3);
System.out.println(cf.apply(Lists.newArrayList(title(s3)))); System.out.println(cf.apply(Lists.newArrayList(title(s3))));
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
System.out.println("s4 = " + s4);
System.out.println(cf.apply(Lists.newArrayList(title(s4))));
} }
} }

View File

@ -51,7 +51,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State"); double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State");
System.out.println("result = " + result); System.out.println("result = " + result);
assertEquals(1.0, result); assertEquals(0.0, result);
} }
@Test @Test
@ -114,4 +114,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
assertTrue(result> 0.9); assertTrue(result> 0.9);
} }
/**
 * Computes the {@code JaroWinklerNormalizedName} distance between a legal name that carries
 * a parenthesised English form and the English form on its own, and prints the score.
 *
 * NOTE(review): the score is only printed, never asserted, so this test cannot fail on a
 * wrong value — confirm the expected score and add a check.
 */
@Test
public void testJaroWinklerNormalizedName8() {
    final String nameWithEnglishForm = "Politechniki Warszawskiej (Warsaw University of Technology)";
    final String englishName = "Warsaw University of Technology";

    final JaroWinklerNormalizedName comparator = new JaroWinklerNormalizedName(params);
    final double score = comparator.distance(nameWithEnglishForm, englishName);

    System.out.println("result = " + score);
}
} }