From a85576c27e7b79c972a5dfce82bb703492eba6b7 Mon Sep 17 00:00:00 2001 From: miconis Date: Fri, 19 Jul 2019 17:10:29 +0200 Subject: [PATCH] restyling of the JaroWinklerNormalizedName comparator, now it is optimized. Addition of some translations in the translation maps, addition of a clustering based on keywords in organizations legalnames --- .../pace/clustering/KeywordsClustering.java | 9 +- .../pace/common/AbstractPaceFunctions.java | 121 +++++++----------- .../algo/JaroWinklerNormalizedName.java | 38 ++---- .../eu/dnetlib/pace/config/city_map.csv | 4 +- .../dnetlib/pace/config/translation_map.csv | 12 +- .../clustering/ClusteringFunctionTest.java | 4 + .../pace/distance/DistanceAlgoTest.java | 11 +- 7 files changed, 87 insertions(+), 112 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index dfd59384b..29d095230 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -1,6 +1,5 @@ package eu.dnetlib.pace.clustering; -import com.google.common.base.Joiner; import eu.dnetlib.pace.common.AbstractPaceFunctions; import java.util.*; @@ -20,14 +19,14 @@ public class KeywordsClustering extends AbstractClusteringFunction { protected Collection doApply(String s) { //takes city codes and keywords codes without duplicates - Set keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4)); - Set cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4)); + Set keywords = getKeywords(s, params.getOrDefault("windowSize", 4)); + Set cities = getCities(s, params.getOrDefault("windowSize", 4)); //list of combination to return as result final Collection combinations = new LinkedHashSet(); - for (String keyword: keywords){ - for (String city: cities) { + for (String keyword: keywordsToCodes(keywords)){ + for (String city: citiesToCodes(cities)) { combinations.add(keyword+"-"+city); if (combinations.size()>=params.getOrDefault("max", 2)) { return combinations; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 748eec9a6..a5f7bf679 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -18,6 +18,8 @@ import java.text.Normalizer; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Set of common functions @@ -27,6 +29,10 @@ import java.util.regex.Pattern; */ public abstract class AbstractPaceFunctions { + + private static Map translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv"); + private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); + protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); @@ -212,99 +218,58 @@ public abstract class AbstractPaceFunctions { return sb.toString().trim(); } - public String keywordsToCode(String s1, Map translationMap, int windowSize){ + public String removeKeywords(String s, Set keywords) { - List tokens = Arrays.asList(s1.split(" ")); - - if (tokens.size() s1, Set s2){ - public double keywordsCompare(String s1, String s2){ + Set k1 = keywordsToCodes(s1); + Set k2 = keywordsToCodes(s2); - List keywords1 = getKeywords(s1); - List keywords2 = getKeywords(s2); - int longer = (keywords1.size()>keywords2.size())?keywords1.size():keywords2.size(); + int longer = (k1.size()>k2.size())?k1.size():k2.size(); - if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty()) + if (k1.isEmpty() || k2.isEmpty()) return 1.0; else - return (double)CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()/(double)longer; + return (double)CollectionUtils.intersection(k1,k2).size()/(double)longer; } - //check if 2 strings have same keywords - public boolean sameKeywords(String s1, String s2){ - //at least 1 keyword in common - if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty()) - return true; - else - return CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()>0; - } - //returns true if at least 1 city is in common - //returns true if a name has no cities - public boolean sameCity(String s1, String s2){ + //returns true if no cities are contained in names + //returns false if one of the two names have no city + public boolean sameCity(Set s1, Set s2){ - if (getCities(s1).isEmpty() || getCities(s2).isEmpty()) + Set c1 = citiesToCodes(s1); + Set c2 = citiesToCodes(s2); + + if (c1.isEmpty() && c2.isEmpty()) return true; - else - return CollectionUtils.intersection(getCities(s1), getCities(s2)).size()>0; + else { + if (c1.isEmpty() ^ c2.isEmpty()) + return false; + return CollectionUtils.intersection(c1, c2).size() > 0; + } } - //get the list of keywords in a string - public List getCities(String s) { - - final String regex = "\\bcity::[0-9]*\\b"; - - Pattern p = Pattern.compile(regex, Pattern.MULTILINE); - Matcher m = p.matcher(s); - List codes = new ArrayList<>(); - while (m.find()) { - codes.add(m.group(0)); - for (int i = 1; i <= m.groupCount(); i++) { - codes.add(m.group(0)); - } - } - return codes; + //convert the set of keywords to codes + public Set toCodes(Set keywords, Map translationMap) { + return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); } - //get the list of keywords in a string - public List getKeywords(String s) { + public Set keywordsToCodes(Set keywords) { + return toCodes(keywords, translationMap); + } - final String regex = "\\bkey::[0-9]*\\b"; - - Pattern p = Pattern.compile(regex, Pattern.MULTILINE); - Matcher m = p.matcher(s); - List codes = new ArrayList<>(); - while (m.find()) { - codes.add(m.group(0)); - for (int i = 1; i <= m.groupCount(); i++) { - codes.add(m.group(0)); - } - } - return codes; + public Set citiesToCodes(Set keywords) { + return toCodes(keywords, cityMap); } protected String firstLC(final String s) { @@ -320,7 +285,7 @@ public abstract class AbstractPaceFunctions { } //get the list of codes into the input string - public Set getCodes(String s1, Map translationMap, int windowSize){ + public Set getKeywords(String s1, Map translationMap, int windowSize){ String s = cleanup(s1); @@ -340,7 +305,7 @@ public abstract class AbstractPaceFunctions { for (int i = 0; i<=tokens.size()-length; i++){ String candidate = Joiner.on(" ").join(tokens.subList(i, i + length)); if (translationMap.containsKey(candidate)) { - codes.add(translationMap.get(candidate)); + codes.add(candidate); s = s.replace(candidate, ""); } } @@ -352,4 +317,12 @@ public abstract class AbstractPaceFunctions { return codes; } + public Set getKeywords(String s1, int windowSize) { + return getKeywords(s1, translationMap, windowSize); + } + + public Set getCities(String s1, int windowSize) { + return getKeywords(s1, cityMap, windowSize); + } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java index 285575a1b..546629b08 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java @@ -11,18 +11,6 @@ import java.util.Set; @DistanceClass("JaroWinklerNormalizedName") public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { - private static Set stopwordsEn = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); - private static Set stopwordsIt = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); - private static Set stopwordsDe = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); - private static Set stopwordsFr = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt"); - private static Set stopwordsPt = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); - private static Set stopwordsEs = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); - - //key=word, value=global identifier => example: "università"->"university", used to substitute the word with the global identifier - private static Map translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv"); - - private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); - private Map params; public JaroWinklerNormalizedName(Map params){ @@ -49,28 +37,30 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); - //replace keywords with codes - String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue()); - String codesB = keywordsToCode(cb, translationMap, params.getOrDefault("windowSize",4).intValue()); + Set keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue()); + Set keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue()); - //replace cities with codes - codesA = keywordsToCode(codesA, cityMap, params.getOrDefault("windowSize", 4).intValue()); - codesB = keywordsToCode(codesB, cityMap, params.getOrDefault("windowSize", 4).intValue()); + Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); + Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); + + if (sameCity(cities1,cities2)) { + + if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) { + + ca = removeKeywords(ca, keywords1); + ca = removeKeywords(ca, cities1); + cb = removeKeywords(cb, keywords2); + cb = removeKeywords(cb, cities2); - //if two names have same city - if (sameCity(codesA,codesB)){ - if (keywordsCompare(codesA, codesB)>params.getOrDefault("threshold", 0.5).doubleValue()) { - ca = removeCodes(codesA); - cb = removeCodes(codesB); if (ca.isEmpty() && cb.isEmpty()) return 1.0; else return normalize(ssalgo.score(ca,cb)); + } } return 0.0; - } @Override diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv index e2d48551d..666615792 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv @@ -6192,7 +6192,7 @@ city::753142;Zoliborz;Zalborz;Zalbórz; city::753866;Zamosc;Gorad Zamasc';Zamosc;Zamosc';Zamosca;Zamoscia;Zamose;Zamoshc;Zamoshch;Zamost'ye;Zamoste;Zamostja;Zamosts;Zamostė;Zamost’ye;Zamość;jamosichi;sa mxchch;zamoshichi;zamostsi;zha mo xi qi;zmwsz;Ζάμοστς;Горад Замасць;Замостя;Замосць;Замошч;Замошћ;Զամոշչ;זמושץ;ซามอชช์;ზამოსცი;ザモシチ;扎莫希奇;자모시치; city::755330;Wola;Volja;Воля; city::756092;Wawer;; -city::756135;Warsaw;Barsobia;Varsa;Varsava;Varsavia;Varsavja;Varshava;Varshavae;Varsja;Varsjá;Varso;Varsova;Varsovia;Varsovia - Warszawa;Varsovie;Varsovio;Varssavi;Varsuva;Varsòvia;Varsó;Varsóvia;Varşova;Varšava;Varšuva;Varșovia;Vársá;WAW;Warsaw;Warsawa;Warschau;Warskou;Warszaw;Warszawa;Waršawa;baleusyaba;hua sha;varshava;vorso;warsw;warushawa;wrsh;wrshw;wrsw;wxrsx;Βαρσοβία;Варшавæ;Варшава;Վարշավա;ווארשע;ורשה;װאַרשע;وارسو;ورشو;ۋارشاۋا;ܘܪܣܘ;वॉर्सो;วอร์ซอ;ვარშავა;ዋርሶው;ワルシャワ;华沙;華沙;바르샤바; +city::756135;Warsaw;warszawie;Barsobia;Varsa;Varsava;Varsavia;Varsavja;Varshava;Varshavae;Varsja;Varsjá;Varso;Varsova;Varsovia;Varsovia - Warszawa;Varsovie;Varsovio;Varssavi;Varsuva;Varsòvia;Varsó;Varsóvia;Varşova;Varšava;Varšuva;Varșovia;Vársá;WAW;Warsaw;Warsawa;Warschau;Warskou;Warszaw;Warszawa;Waršawa;baleusyaba;hua sha;varshava;vorso;warsw;warushawa;wrsh;wrshw;wrsw;wxrsx;Βαρσοβία;Варшавæ;Варшава;Վարշավա;ווארשע;ורשה;װאַרשע;وارسو;ورشو;ۋارشاۋا;ܘܪܣܘ;वॉर्सो;วอร์ซอ;ვარშავა;ዋርሶው;ワルシャワ;华沙;華沙;바르샤바;warszawskiej;warszawska; city::756867;Tomaszow Mazowiecki;Mazovijos Tomasuvas;Mazovijos Tomašuvas;Thomasovia;Tomashov Mazovecki;Tomashov Mazovjecki;Tomashuv-Mazovec'kij;Tomashuv-Mazovecki;Tomashuv-Mazoveckij;Tomasova Mazovecka;Tomaszow;Tomaszow Mazowiecki;Tomaszów;Tomaszów Mazowiecki;Tomašova Mazovecka;ma zuo fu she de qu tuo ma shu fu;twmswb mzwbyyzqy;tx ma chuf ma sx weiyt ski;Томашов Мазовецки;Томашов Мазовјецки;Томашув-Мазовецки;Томашув-Мазовецкий;Томашув-Мазовецький;טומשוב מזובייצקי;ตอมาชูฟมาซอเวียตสกี;馬佐夫舍地區托馬舒夫; city::757026;Tarnow;Gorad Tarnuu;Tarnov;Tarnova;Tarnow;Tarnuv;Tarnuvas;Tarnuw;Tarnów;Tarnůw;ta er nu fu;taleunupeu;tarnwf;tarunufu;trnwb;Горад Тарнуў;Тарнов;Тарнув;טארנע;טרנוב;تارنوف;タルヌフ;塔爾努夫;타르누프; city::757033;Tarnobrzeg;Gorad Tarnobzhag;Nova Tarnovia;QEP;Tarnobjeg;Tarnobrzeg;Tarnobrzyg;Tarnobzega;Tarnobzegas;Tarnobzheg;Tarnobzhege;Tarnobžega;Tarnobžegas;ta er nuo bu re ge;taleunobeujekeu;tarnwbzk;tarunobujeku;trnwbzg;Горад Тарнобжаг;Тарнобжег;Тарнобжеге;דזשיקאוו;טרנובזג;تارنوبزک;タルノブジェク;塔尔诺布热格;타르노브제크; @@ -6229,7 +6229,7 @@ city::3080165;Zielona Gora;Gorad Zjaljona-Gura;Gruentberg;Grunberg;Grünberg;IEG city::3080251;Zgierz;Gorad Zgezh;Zgeza;Zgezas;Zgezh;Zgeža;Zgežas;Zgierz;Zgjezh;Znkies;ci gai ri;jeugiesi;sex keiyr ch;zgyyz;zugyeshi;Ζγκιες;Горад Згеж;Згеж;Згјеж;זגייז;เซอเกียร์ช;ズギェシ;兹盖日;즈기에시; city::3080526;Zawiercie;Zaverce;Zaverche;Zavercis;Zavertse;Zaviercis;Zavjerce;Zawiercie;zha wei er qie;zwwyyrzh;Заверце;Заверче;Завјерће;זוויירצה;扎维尔切; city::3080985;Zabrze;Gorad Zabzheh;Hindenburg;Zabje;Zaborze;Zabrze;Zabzas;Zabze;Zabzhe;Zabžas;Zabže;Zobrze;jabeuje;sab che;zabjh;zabuje;zabzhh;zha bu re;zʼbzh;Горад Забжэ;Забже;Զաբժե;זאבזה;زابجه;زابژه;زبرزے;ซาบเช;ザブジェ;扎布热;자브제; -city::3081368;Wroclaw;Brassel;Breslau;Breslavia;Breslavl';Breslavl’;Gorad Vroclau;Vratislav;Vratislavia;Vroclav;Vroclava;Vroclavas;Vroclavo;Vroklave;Vroslav;Vrotslav;WRO;Wroclaw;Wroclow;Wrocław;Wrocłow;Wroklaw;Wroslaw;Wrosław;Wrócław;beulocheuwapeu;frwtswaf;fu luo ci wa fu;viratscahp;vrotsavapha;vrotslavi;vurotsuwafu;w rxtswaf;wrwslaw;wrwtswaf;wrwzlb;Βρότσλαβ;Вроцлав;Горад Вроцлаў;ברעסלוי;ורוצלב;فروتسواف;وروتسواف;وروسلاو;ڤرۆتسواف;व्रोत्सवाफ;விராத்ஸ்சாஃப்;วรอตสวัฟ;ვროცლავი;ヴロツワフ;弗罗茨瓦夫;브로츠와프; +city::3081368;Wroclaw;Brassel;Breslau;Breslavia;Breslavl';Breslavl’;Gorad Vroclau;Vratislav;Vratislavia;Vroclav;Vroclava;Vroclavas;Vroclavo;Vroklave;Vroslav;Vrotslav;WRO;Wroclaw;Wroclow;Wrocław;Wrocłow;Wroklaw;Wroslaw;Wrosław;Wrócław;beulocheuwapeu;frwtswaf;fu luo ci wa fu;viratscahp;vrotsavapha;vrotslavi;vurotsuwafu;w rxtswaf;wrwslaw;wrwtswaf;wrwzlb;Βρότσλαβ;Вроцлав;Горад Вроцлаў;ברעסלוי;ורוצלב;فروتسواف;وروتسواف;وروسلاو;ڤرۆتسواف;व्रोत्सवाफ;விராத்ஸ்சாஃப்;วรอตสวัฟ;ვროცლავი;ヴロツワフ;弗罗茨瓦夫;브로츠와프;Wrocławska; city::3081741;Wloclawek;Gorad Ulaclavak;Leslau;Vloclavek;Vloclaveka;Vloclavekas;Wloclawek;Włocławek;beuwocheuwabekeu;fu wo ci wa wei ke;fwtswafk;vuu~otsuwavu~eku;w wxts wa wek;Влоцлавек;Горад Улацлавак;ולוצלאווק;فوتسوافك;ووتس‌واوک;ววอตซวาเวก;ヴウォツワヴェク;弗沃茨瓦韦克;브워츠와베크; city::3082707;Walbrzych;Gorad Valbzhykh;Valbzhikh;Valbziha;Valbzychas;Valbžiha;Valbžychas;Walbrzych;Waldenburg;Waldenburg in Schlesien;Wałbrzych;baubeujiheu;vu~aubujifu;wa lb cik;wa wu bu ri he;wawb jykh;wawb zhykh;wlbzyk;Валбжих;Горад Валбжых;ולבזיך;واوب جيخ;واوب ژیخ;วาลบ์จิก;ヴァウブジフ;瓦烏布日赫;바우브지흐; city::3082914;Tychy;Tichau;Tihi;Tikhi;Tychos;Tychy;Tykhy;Tıhı;di hei;thi khi;tihi;tixa;tyshy;Тихи;Тыхы;تیشی;ทิคี;ティヒ;蒂黑;티히; diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv index dc77a6057..bab68360d 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv @@ -1,6 +1,6 @@ key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές -key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα +key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza; key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas @@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο key::39;bureau;ufficio;bureau;офис;bureau;γραφείο key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία -key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university +key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology; key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί @@ -47,13 +47,13 @@ key::46;division;divisione;division;отделение;divisie;τμήμα key::47;committee;comitato;comité;комитет;commissie;επιτροπή key::48;promotion;promozione;продвижение;proothisis;forderung key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline; -key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline; +key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii; key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik; key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri; key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus; key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia; key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik; -key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon; +key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon;informatycznych; key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne; key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;; key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri; @@ -71,7 +71,7 @@ key::70;veterinary;veterinaria;veterinarie;veterinaria;veterinária;tierärtzlic key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia; key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek; key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia; -key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar; +key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar;wojskowa; key::75;theological;teologia;teologico;teológico;tecnológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;technológiai;teološki;teoloogia;usuteadus;teoloogiline; key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika; key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus; @@ -100,4 +100,4 @@ key::99;neurology;neurologia;neurologiche;neurología;neurologia;Neurologie;neur key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia; key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia; key::102;informatics;informatica;informática;informática;informatica; -key:103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association +key:103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association \ No newline at end of file diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index b845c395e..58f86d01d 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -137,6 +137,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println("s3 = " + s3); System.out.println(cf.apply(Lists.newArrayList(title(s3)))); + final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)"; + System.out.println("s4 = " + s4); + System.out.println(cf.apply(Lists.newArrayList(title(s4)))); + } } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 6f196ec4f..11ae4183a 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -51,7 +51,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State"); System.out.println("result = " + result); - assertEquals(1.0, result); + assertEquals(0.0, result); } @Test @@ -114,4 +114,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { assertTrue(result> 0.9); } + @Test + public void testJaroWinklerNormalizedName8() { + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + + double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology"); + + System.out.println("result = " + result); + } + }