forked from D-Net/dnet-hadoop
Restyling of the JaroWinklerNormalizedName comparator, which is now optimized. Added some translations to the translation maps and a clustering function based on keywords in organizations' legal names.
This commit is contained in:
parent
fb5e38db26
commit
a85576c27e
|
@ -1,6 +1,5 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import com.google.common.base.Joiner;
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -20,14 +19,14 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
|
|
||||||
//takes city codes and keywords codes without duplicates
|
//takes city codes and keywords codes without duplicates
|
||||||
Set<String> keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4));
|
Set<String> keywords = getKeywords(s, params.getOrDefault("windowSize", 4));
|
||||||
Set<String> cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4));
|
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
|
||||||
|
|
||||||
//list of combination to return as result
|
//list of combination to return as result
|
||||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||||
|
|
||||||
for (String keyword: keywords){
|
for (String keyword: keywordsToCodes(keywords)){
|
||||||
for (String city: cities) {
|
for (String city: citiesToCodes(cities)) {
|
||||||
combinations.add(keyword+"-"+city);
|
combinations.add(keyword+"-"+city);
|
||||||
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
||||||
return combinations;
|
return combinations;
|
||||||
|
|
|
@ -18,6 +18,8 @@ import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set of common functions
|
* Set of common functions
|
||||||
|
@ -27,6 +29,10 @@ import java.util.regex.Pattern;
|
||||||
*/
|
*/
|
||||||
public abstract class AbstractPaceFunctions {
|
public abstract class AbstractPaceFunctions {
|
||||||
|
|
||||||
|
|
||||||
|
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||||
|
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||||
|
|
||||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||||
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
||||||
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
||||||
|
@ -212,99 +218,58 @@ public abstract class AbstractPaceFunctions {
|
||||||
return sb.toString().trim();
|
return sb.toString().trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String keywordsToCode(String s1, Map<String, String> translationMap, int windowSize){
|
public String removeKeywords(String s, Set<String> keywords) {
|
||||||
|
|
||||||
List<String> tokens = Arrays.asList(s1.split(" "));
|
s = " " + s + " ";
|
||||||
|
for (String k: keywords ) {
|
||||||
if (tokens.size()<windowSize)
|
s = s.replaceAll(k.toLowerCase(), "");
|
||||||
windowSize = tokens.size();
|
|
||||||
|
|
||||||
int length = windowSize;
|
|
||||||
|
|
||||||
while (length != 0) {
|
|
||||||
|
|
||||||
for (int i = 0; i<=tokens.size()-length; i++){
|
|
||||||
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
|
|
||||||
if (translationMap.containsKey(candidate)) {
|
|
||||||
s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + translationMap.get(candidate) + " ");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
length-=1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return s1;
|
return s.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public String removeCodes(String s) {
|
public double keywordsCompare(Set<String> s1, Set<String> s2){
|
||||||
final String regexKey = "\\bkey::[0-9]*\\b";
|
|
||||||
final String regexCity = "\\bcity::[0-9]*\\b";
|
|
||||||
return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
public double keywordsCompare(String s1, String s2){
|
Set<String> k1 = keywordsToCodes(s1);
|
||||||
|
Set<String> k2 = keywordsToCodes(s2);
|
||||||
|
|
||||||
List<String> keywords1 = getKeywords(s1);
|
int longer = (k1.size()>k2.size())?k1.size():k2.size();
|
||||||
List<String> keywords2 = getKeywords(s2);
|
|
||||||
int longer = (keywords1.size()>keywords2.size())?keywords1.size():keywords2.size();
|
|
||||||
|
|
||||||
if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty())
|
if (k1.isEmpty() || k2.isEmpty())
|
||||||
return 1.0;
|
return 1.0;
|
||||||
else
|
else
|
||||||
return (double)CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()/(double)longer;
|
return (double)CollectionUtils.intersection(k1,k2).size()/(double)longer;
|
||||||
}
|
|
||||||
|
|
||||||
//check if 2 strings have same keywords
|
|
||||||
public boolean sameKeywords(String s1, String s2){
|
|
||||||
//at least 1 keyword in common
|
|
||||||
if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty())
|
|
||||||
return true;
|
|
||||||
else
|
|
||||||
return CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()>0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//returns true if at least 1 city is in common
|
//returns true if at least 1 city is in common
|
||||||
//returns true if a name has no cities
|
//returns true if no cities are contained in names
|
||||||
public boolean sameCity(String s1, String s2){
|
//returns false if one of the two names have no city
|
||||||
|
public boolean sameCity(Set<String> s1, Set<String> s2){
|
||||||
|
|
||||||
if (getCities(s1).isEmpty() || getCities(s2).isEmpty())
|
Set<String> c1 = citiesToCodes(s1);
|
||||||
|
Set<String> c2 = citiesToCodes(s2);
|
||||||
|
|
||||||
|
if (c1.isEmpty() && c2.isEmpty())
|
||||||
return true;
|
return true;
|
||||||
else
|
else {
|
||||||
return CollectionUtils.intersection(getCities(s1), getCities(s2)).size()>0;
|
if (c1.isEmpty() ^ c2.isEmpty())
|
||||||
}
|
return false;
|
||||||
|
return CollectionUtils.intersection(c1, c2).size() > 0;
|
||||||
//get the list of keywords in a string
|
|
||||||
public List<String> getCities(String s) {
|
|
||||||
|
|
||||||
final String regex = "\\bcity::[0-9]*\\b";
|
|
||||||
|
|
||||||
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
|
|
||||||
Matcher m = p.matcher(s);
|
|
||||||
List<String> codes = new ArrayList<>();
|
|
||||||
while (m.find()) {
|
|
||||||
codes.add(m.group(0));
|
|
||||||
for (int i = 1; i <= m.groupCount(); i++) {
|
|
||||||
codes.add(m.group(0));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return codes;
|
|
||||||
|
//convert the set of keywords to codes
|
||||||
|
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||||
|
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||||
}
|
}
|
||||||
|
|
||||||
//get the list of keywords in a string
|
public Set<String> keywordsToCodes(Set<String> keywords) {
|
||||||
public List<String> getKeywords(String s) {
|
return toCodes(keywords, translationMap);
|
||||||
|
|
||||||
final String regex = "\\bkey::[0-9]*\\b";
|
|
||||||
|
|
||||||
Pattern p = Pattern.compile(regex, Pattern.MULTILINE);
|
|
||||||
Matcher m = p.matcher(s);
|
|
||||||
List<String> codes = new ArrayList<>();
|
|
||||||
while (m.find()) {
|
|
||||||
codes.add(m.group(0));
|
|
||||||
for (int i = 1; i <= m.groupCount(); i++) {
|
|
||||||
codes.add(m.group(0));
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return codes;
|
public Set<String> citiesToCodes(Set<String> keywords) {
|
||||||
|
return toCodes(keywords, cityMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String firstLC(final String s) {
|
protected String firstLC(final String s) {
|
||||||
|
@ -320,7 +285,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
//get the list of codes into the input string
|
//get the list of codes into the input string
|
||||||
public Set<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){
|
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize){
|
||||||
|
|
||||||
String s = cleanup(s1);
|
String s = cleanup(s1);
|
||||||
|
|
||||||
|
@ -340,7 +305,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
for (int i = 0; i<=tokens.size()-length; i++){
|
for (int i = 0; i<=tokens.size()-length; i++){
|
||||||
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
|
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
|
||||||
if (translationMap.containsKey(candidate)) {
|
if (translationMap.containsKey(candidate)) {
|
||||||
codes.add(translationMap.get(candidate));
|
codes.add(candidate);
|
||||||
s = s.replace(candidate, "");
|
s = s.replace(candidate, "");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -352,4 +317,12 @@ public abstract class AbstractPaceFunctions {
|
||||||
return codes;
|
return codes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Set<String> getKeywords(String s1, int windowSize) {
|
||||||
|
return getKeywords(s1, translationMap, windowSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<String> getCities(String s1, int windowSize) {
|
||||||
|
return getKeywords(s1, cityMap, windowSize);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,18 +11,6 @@ import java.util.Set;
|
||||||
@DistanceClass("JaroWinklerNormalizedName")
|
@DistanceClass("JaroWinklerNormalizedName")
|
||||||
public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
private static Set<String> stopwordsEn = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
|
||||||
private static Set<String> stopwordsIt = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
|
||||||
private static Set<String> stopwordsDe = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
|
||||||
private static Set<String> stopwordsFr = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
|
|
||||||
private static Set<String> stopwordsPt = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
|
||||||
private static Set<String> stopwordsEs = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
|
||||||
|
|
||||||
//key=word, value=global identifier => example: "università"->"university", used to substitute the word with the global identifier
|
|
||||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
|
||||||
|
|
||||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, Number> params;
|
||||||
|
|
||||||
public JaroWinklerNormalizedName(Map<String, Number> params){
|
public JaroWinklerNormalizedName(Map<String, Number> params){
|
||||||
|
@ -49,28 +37,30 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||||
ca = filterAllStopWords(ca);
|
ca = filterAllStopWords(ca);
|
||||||
cb = filterAllStopWords(cb);
|
cb = filterAllStopWords(cb);
|
||||||
|
|
||||||
//replace keywords with codes
|
Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||||
String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||||
String codesB = keywordsToCode(cb, translationMap, params.getOrDefault("windowSize",4).intValue());
|
|
||||||
|
|
||||||
//replace cities with codes
|
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||||
codesA = keywordsToCode(codesA, cityMap, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||||
codesB = keywordsToCode(codesB, cityMap, params.getOrDefault("windowSize", 4).intValue());
|
|
||||||
|
if (sameCity(cities1,cities2)) {
|
||||||
|
|
||||||
|
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||||
|
|
||||||
|
ca = removeKeywords(ca, keywords1);
|
||||||
|
ca = removeKeywords(ca, cities1);
|
||||||
|
cb = removeKeywords(cb, keywords2);
|
||||||
|
cb = removeKeywords(cb, cities2);
|
||||||
|
|
||||||
//if two names have same city
|
|
||||||
if (sameCity(codesA,codesB)){
|
|
||||||
if (keywordsCompare(codesA, codesB)>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
|
||||||
ca = removeCodes(codesA);
|
|
||||||
cb = removeCodes(codesB);
|
|
||||||
if (ca.isEmpty() && cb.isEmpty())
|
if (ca.isEmpty() && cb.isEmpty())
|
||||||
return 1.0;
|
return 1.0;
|
||||||
else
|
else
|
||||||
return normalize(ssalgo.score(ca,cb));
|
return normalize(ssalgo.score(ca,cb));
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -6192,7 +6192,7 @@ city::753142;Zoliborz;Zalborz;Zalbórz;
|
||||||
city::753866;Zamosc;Gorad Zamasc';Zamosc;Zamosc';Zamosca;Zamoscia;Zamose;Zamoshc;Zamoshch;Zamost'ye;Zamoste;Zamostja;Zamosts;Zamostė;Zamost’ye;Zamość;jamosichi;sa mxchch;zamoshichi;zamostsi;zha mo xi qi;zmwsz;Ζάμοστς;Горад Замасць;Замостя;Замосць;Замошч;Замошћ;Զամոշչ;זמושץ;ซามอชช์;ზამოსცი;ザモシチ;扎莫希奇;자모시치;
|
city::753866;Zamosc;Gorad Zamasc';Zamosc;Zamosc';Zamosca;Zamoscia;Zamose;Zamoshc;Zamoshch;Zamost'ye;Zamoste;Zamostja;Zamosts;Zamostė;Zamost’ye;Zamość;jamosichi;sa mxchch;zamoshichi;zamostsi;zha mo xi qi;zmwsz;Ζάμοστς;Горад Замасць;Замостя;Замосць;Замошч;Замошћ;Զամոշչ;זמושץ;ซามอชช์;ზამოსცი;ザモシチ;扎莫希奇;자모시치;
|
||||||
city::755330;Wola;Volja;Воля;
|
city::755330;Wola;Volja;Воля;
|
||||||
city::756092;Wawer;;
|
city::756092;Wawer;;
|
||||||
city::756135;Warsaw;Barsobia;Varsa;Varsava;Varsavia;Varsavja;Varshava;Varshavae;Varsja;Varsjá;Varso;Varsova;Varsovia;Varsovia - Warszawa;Varsovie;Varsovio;Varssavi;Varsuva;Varsòvia;Varsó;Varsóvia;Varşova;Varšava;Varšuva;Varșovia;Vársá;WAW;Warsaw;Warsawa;Warschau;Warskou;Warszaw;Warszawa;Waršawa;baleusyaba;hua sha;varshava;vorso;warsw;warushawa;wrsh;wrshw;wrsw;wxrsx;Βαρσοβία;Варшавæ;Варшава;Վարշավա;ווארשע;ורשה;װאַרשע;وارسو;ورشو;ۋارشاۋا;ܘܪܣܘ;वॉर्सो;วอร์ซอ;ვარშავა;ዋርሶው;ワルシャワ;华沙;華沙;바르샤바;
|
city::756135;Warsaw;warszawie;Barsobia;Varsa;Varsava;Varsavia;Varsavja;Varshava;Varshavae;Varsja;Varsjá;Varso;Varsova;Varsovia;Varsovia - Warszawa;Varsovie;Varsovio;Varssavi;Varsuva;Varsòvia;Varsó;Varsóvia;Varşova;Varšava;Varšuva;Varșovia;Vársá;WAW;Warsaw;Warsawa;Warschau;Warskou;Warszaw;Warszawa;Waršawa;baleusyaba;hua sha;varshava;vorso;warsw;warushawa;wrsh;wrshw;wrsw;wxrsx;Βαρσοβία;Варшавæ;Варшава;Վարշավա;ווארשע;ורשה;װאַרשע;وارسو;ورشو;ۋارشاۋا;ܘܪܣܘ;वॉर्सो;วอร์ซอ;ვარშავა;ዋርሶው;ワルシャワ;华沙;華沙;바르샤바;warszawskiej;warszawska;
|
||||||
city::756867;Tomaszow Mazowiecki;Mazovijos Tomasuvas;Mazovijos Tomašuvas;Thomasovia;Tomashov Mazovecki;Tomashov Mazovjecki;Tomashuv-Mazovec'kij;Tomashuv-Mazovecki;Tomashuv-Mazoveckij;Tomasova Mazovecka;Tomaszow;Tomaszow Mazowiecki;Tomaszów;Tomaszów Mazowiecki;Tomašova Mazovecka;ma zuo fu she de qu tuo ma shu fu;twmswb mzwbyyzqy;tx ma chuf ma sx weiyt ski;Томашов Мазовецки;Томашов Мазовјецки;Томашув-Мазовецки;Томашув-Мазовецкий;Томашув-Мазовецький;טומשוב מזובייצקי;ตอมาชูฟมาซอเวียตสกี;馬佐夫舍地區托馬舒夫;
|
city::756867;Tomaszow Mazowiecki;Mazovijos Tomasuvas;Mazovijos Tomašuvas;Thomasovia;Tomashov Mazovecki;Tomashov Mazovjecki;Tomashuv-Mazovec'kij;Tomashuv-Mazovecki;Tomashuv-Mazoveckij;Tomasova Mazovecka;Tomaszow;Tomaszow Mazowiecki;Tomaszów;Tomaszów Mazowiecki;Tomašova Mazovecka;ma zuo fu she de qu tuo ma shu fu;twmswb mzwbyyzqy;tx ma chuf ma sx weiyt ski;Томашов Мазовецки;Томашов Мазовјецки;Томашув-Мазовецки;Томашув-Мазовецкий;Томашув-Мазовецький;טומשוב מזובייצקי;ตอมาชูฟมาซอเวียตสกี;馬佐夫舍地區托馬舒夫;
|
||||||
city::757026;Tarnow;Gorad Tarnuu;Tarnov;Tarnova;Tarnow;Tarnuv;Tarnuvas;Tarnuw;Tarnów;Tarnůw;ta er nu fu;taleunupeu;tarnwf;tarunufu;trnwb;Горад Тарнуў;Тарнов;Тарнув;טארנע;טרנוב;تارنوف;タルヌフ;塔爾努夫;타르누프;
|
city::757026;Tarnow;Gorad Tarnuu;Tarnov;Tarnova;Tarnow;Tarnuv;Tarnuvas;Tarnuw;Tarnów;Tarnůw;ta er nu fu;taleunupeu;tarnwf;tarunufu;trnwb;Горад Тарнуў;Тарнов;Тарнув;טארנע;טרנוב;تارنوف;タルヌフ;塔爾努夫;타르누프;
|
||||||
city::757033;Tarnobrzeg;Gorad Tarnobzhag;Nova Tarnovia;QEP;Tarnobjeg;Tarnobrzeg;Tarnobrzyg;Tarnobzega;Tarnobzegas;Tarnobzheg;Tarnobzhege;Tarnobžega;Tarnobžegas;ta er nuo bu re ge;taleunobeujekeu;tarnwbzk;tarunobujeku;trnwbzg;Горад Тарнобжаг;Тарнобжег;Тарнобжеге;דזשיקאוו;טרנובזג;تارنوبزک;タルノブジェク;塔尔诺布热格;타르노브제크;
|
city::757033;Tarnobrzeg;Gorad Tarnobzhag;Nova Tarnovia;QEP;Tarnobjeg;Tarnobrzeg;Tarnobrzyg;Tarnobzega;Tarnobzegas;Tarnobzheg;Tarnobzhege;Tarnobžega;Tarnobžegas;ta er nuo bu re ge;taleunobeujekeu;tarnwbzk;tarunobujeku;trnwbzg;Горад Тарнобжаг;Тарнобжег;Тарнобжеге;דזשיקאוו;טרנובזג;تارنوبزک;タルノブジェク;塔尔诺布热格;타르노브제크;
|
||||||
|
@ -6229,7 +6229,7 @@ city::3080165;Zielona Gora;Gorad Zjaljona-Gura;Gruentberg;Grunberg;Grünberg;IEG
|
||||||
city::3080251;Zgierz;Gorad Zgezh;Zgeza;Zgezas;Zgezh;Zgeža;Zgežas;Zgierz;Zgjezh;Znkies;ci gai ri;jeugiesi;sex keiyr ch;zgyyz;zugyeshi;Ζγκιες;Горад Згеж;Згеж;Згјеж;זגייז;เซอเกียร์ช;ズギェシ;兹盖日;즈기에시;
|
city::3080251;Zgierz;Gorad Zgezh;Zgeza;Zgezas;Zgezh;Zgeža;Zgežas;Zgierz;Zgjezh;Znkies;ci gai ri;jeugiesi;sex keiyr ch;zgyyz;zugyeshi;Ζγκιες;Горад Згеж;Згеж;Згјеж;זגייז;เซอเกียร์ช;ズギェシ;兹盖日;즈기에시;
|
||||||
city::3080526;Zawiercie;Zaverce;Zaverche;Zavercis;Zavertse;Zaviercis;Zavjerce;Zawiercie;zha wei er qie;zwwyyrzh;Заверце;Заверче;Завјерће;זוויירצה;扎维尔切;
|
city::3080526;Zawiercie;Zaverce;Zaverche;Zavercis;Zavertse;Zaviercis;Zavjerce;Zawiercie;zha wei er qie;zwwyyrzh;Заверце;Заверче;Завјерће;זוויירצה;扎维尔切;
|
||||||
city::3080985;Zabrze;Gorad Zabzheh;Hindenburg;Zabje;Zaborze;Zabrze;Zabzas;Zabze;Zabzhe;Zabžas;Zabže;Zobrze;jabeuje;sab che;zabjh;zabuje;zabzhh;zha bu re;zʼbzh;Горад Забжэ;Забже;Զաբժե;זאבזה;زابجه;زابژه;زبرزے;ซาบเช;ザブジェ;扎布热;자브제;
|
city::3080985;Zabrze;Gorad Zabzheh;Hindenburg;Zabje;Zaborze;Zabrze;Zabzas;Zabze;Zabzhe;Zabžas;Zabže;Zobrze;jabeuje;sab che;zabjh;zabuje;zabzhh;zha bu re;zʼbzh;Горад Забжэ;Забже;Զաբժե;זאבזה;زابجه;زابژه;زبرزے;ซาบเช;ザブジェ;扎布热;자브제;
|
||||||
city::3081368;Wroclaw;Brassel;Breslau;Breslavia;Breslavl';Breslavl’;Gorad Vroclau;Vratislav;Vratislavia;Vroclav;Vroclava;Vroclavas;Vroclavo;Vroklave;Vroslav;Vrotslav;WRO;Wroclaw;Wroclow;Wrocław;Wrocłow;Wroklaw;Wroslaw;Wrosław;Wrócław;beulocheuwapeu;frwtswaf;fu luo ci wa fu;viratscahp;vrotsavapha;vrotslavi;vurotsuwafu;w rxtswaf;wrwslaw;wrwtswaf;wrwzlb;Βρότσλαβ;Вроцлав;Горад Вроцлаў;ברעסלוי;ורוצלב;فروتسواف;وروتسواف;وروسلاو;ڤرۆتسواف;व्रोत्सवाफ;விராத்ஸ்சாஃப்;วรอตสวัฟ;ვროცლავი;ヴロツワフ;弗罗茨瓦夫;브로츠와프;
|
city::3081368;Wroclaw;Brassel;Breslau;Breslavia;Breslavl';Breslavl’;Gorad Vroclau;Vratislav;Vratislavia;Vroclav;Vroclava;Vroclavas;Vroclavo;Vroklave;Vroslav;Vrotslav;WRO;Wroclaw;Wroclow;Wrocław;Wrocłow;Wroklaw;Wroslaw;Wrosław;Wrócław;beulocheuwapeu;frwtswaf;fu luo ci wa fu;viratscahp;vrotsavapha;vrotslavi;vurotsuwafu;w rxtswaf;wrwslaw;wrwtswaf;wrwzlb;Βρότσλαβ;Вроцлав;Горад Вроцлаў;ברעסלוי;ורוצלב;فروتسواف;وروتسواف;وروسلاو;ڤرۆتسواف;व्रोत्सवाफ;விராத்ஸ்சாஃப்;วรอตสวัฟ;ვროცლავი;ヴロツワフ;弗罗茨瓦夫;브로츠와프;Wrocławska;
|
||||||
city::3081741;Wloclawek;Gorad Ulaclavak;Leslau;Vloclavek;Vloclaveka;Vloclavekas;Wloclawek;Włocławek;beuwocheuwabekeu;fu wo ci wa wei ke;fwtswafk;vuu~otsuwavu~eku;w wxts wa wek;Влоцлавек;Горад Улацлавак;ולוצלאווק;فوتسوافك;ووتسواوک;ววอตซวาเวก;ヴウォツワヴェク;弗沃茨瓦韦克;브워츠와베크;
|
city::3081741;Wloclawek;Gorad Ulaclavak;Leslau;Vloclavek;Vloclaveka;Vloclavekas;Wloclawek;Włocławek;beuwocheuwabekeu;fu wo ci wa wei ke;fwtswafk;vuu~otsuwavu~eku;w wxts wa wek;Влоцлавек;Горад Улацлавак;ולוצלאווק;فوتسوافك;ووتسواوک;ววอตซวาเวก;ヴウォツワヴェク;弗沃茨瓦韦克;브워츠와베크;
|
||||||
city::3082707;Walbrzych;Gorad Valbzhykh;Valbzhikh;Valbziha;Valbzychas;Valbžiha;Valbžychas;Walbrzych;Waldenburg;Waldenburg in Schlesien;Wałbrzych;baubeujiheu;vu~aubujifu;wa lb cik;wa wu bu ri he;wawb jykh;wawb zhykh;wlbzyk;Валбжих;Горад Валбжых;ולבזיך;واوب جيخ;واوب ژیخ;วาลบ์จิก;ヴァウブジフ;瓦烏布日赫;바우브지흐;
|
city::3082707;Walbrzych;Gorad Valbzhykh;Valbzhikh;Valbziha;Valbzychas;Valbžiha;Valbžychas;Walbrzych;Waldenburg;Waldenburg in Schlesien;Wałbrzych;baubeujiheu;vu~aubujifu;wa lb cik;wa wu bu ri he;wawb jykh;wawb zhykh;wlbzyk;Валбжих;Горад Валбжых;ולבזיך;واوب جيخ;واوب ژیخ;วาลบ์จิก;ヴァウブジフ;瓦烏布日赫;바우브지흐;
|
||||||
city::3082914;Tychy;Tichau;Tihi;Tikhi;Tychos;Tychy;Tykhy;Tıhı;di hei;thi khi;tihi;tixa;tyshy;Тихи;Тыхы;تیشی;ทิคี;ティヒ;蒂黑;티히;
|
city::3082914;Tychy;Tichau;Tihi;Tikhi;Tychos;Tychy;Tykhy;Tıhı;di hei;thi khi;tihi;tixa;tyshy;Тихи;Тыхы;تیشی;ทิคี;ティヒ;蒂黑;티히;
|
||||||
|
|
Can't render this file because it is too large.
|
|
@ -1,6 +1,6 @@
|
||||||
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
|
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
|
||||||
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
|
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
|
||||||
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα
|
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza;
|
||||||
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
|
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
|
||||||
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
|
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
|
||||||
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
|
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
|
||||||
|
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
|
||||||
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
|
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
|
||||||
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
|
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
|
||||||
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
|
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
|
||||||
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university
|
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology;
|
||||||
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
|
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
|
||||||
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
|
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
|
||||||
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
|
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
|
||||||
|
@ -47,13 +47,13 @@ key::46;division;divisione;division;отделение;divisie;τμήμα
|
||||||
key::47;committee;comitato;comité;комитет;commissie;επιτροπή
|
key::47;committee;comitato;comité;комитет;commissie;επιτροπή
|
||||||
key::48;promotion;promozione;продвижение;proothisis;forderung
|
key::48;promotion;promozione;продвижение;proothisis;forderung
|
||||||
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
|
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
|
||||||
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;
|
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;
|
||||||
key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
|
key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
|
||||||
key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
|
key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
|
||||||
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
|
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
|
||||||
key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia;
|
key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia;
|
||||||
key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik;
|
key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik;
|
||||||
key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon;
|
key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon;informatycznych;
|
||||||
key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne;
|
key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne;
|
||||||
key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;;
|
key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;;
|
||||||
key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri;
|
key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri;
|
||||||
|
@ -71,7 +71,7 @@ key::70;veterinary;veterinaria;veterinarie;veterinaria;veterinária;tierärtzlic
|
||||||
key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia;
|
key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia;
|
||||||
key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek;
|
key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek;
|
||||||
key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia;
|
key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia;
|
||||||
key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar;
|
key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar;wojskowa;
|
||||||
key::75;theological;teologia;teologico;teológico;tecnológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;technológiai;teološki;teoloogia;usuteadus;teoloogiline;
|
key::75;theological;teologia;teologico;teológico;tecnológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;technológiai;teološki;teoloogia;usuteadus;teoloogiline;
|
||||||
key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika;
|
key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika;
|
||||||
key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus;
|
key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus;
|
||||||
|
|
|
|
@ -137,6 +137,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
System.out.println("s3 = " + s3);
|
System.out.println("s3 = " + s3);
|
||||||
System.out.println(cf.apply(Lists.newArrayList(title(s3))));
|
System.out.println(cf.apply(Lists.newArrayList(title(s3))));
|
||||||
|
|
||||||
|
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
||||||
|
System.out.println("s4 = " + s4);
|
||||||
|
System.out.println(cf.apply(Lists.newArrayList(title(s4))));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,7 +51,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State");
|
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State");
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
assertEquals(1.0, result);
|
assertEquals(0.0, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -114,4 +114,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
assertTrue(result> 0.9);
|
assertTrue(result> 0.9);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testJaroWinklerNormalizedName8() {
|
||||||
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
|
||||||
|
double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology");
|
||||||
|
|
||||||
|
System.out.println("result = " + result);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue