From 8c867101ef08034fbe86f338a77f88693f1fe175 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 6 Aug 2019 17:06:05 +0200 Subject: [PATCH] add a fixSpecial function to handle special characters in organization names; add new terms to the translation maps --- .../pace/clustering/KeywordsClustering.java | 21 +++++++++--- .../pace/common/AbstractPaceFunctions.java | 32 ++++++++++++------- .../eu/dnetlib/pace/config/city_map.csv | 2 +- .../eu/dnetlib/pace/config/stopwords_it.txt | 1 - .../dnetlib/pace/config/translation_map.csv | 13 ++++---- .../clustering/ClusteringFunctionTest.java | 4 +++ .../pace/distance/DistanceAlgoTest.java | 18 +++++++++++ 7 files changed, 68 insertions(+), 23 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index 29d095230..1cabecd60 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -1,16 +1,15 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.model.Field; +import org.apache.commons.lang.StringUtils; import java.util.*; +import java.util.stream.Collectors; @ClusteringClass("keywordsclustering") public class KeywordsClustering extends AbstractClusteringFunction { - private static Map translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv"); - - private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); - public KeywordsClustering(Map params) { super(params); } @@ -36,4 +35,18 @@ public class KeywordsClustering extends AbstractClusteringFunction { return combinations; } + + @Override + public Collection apply(List fields) { + return fields.stream().filter(f -> !f.isEmpty()) + 
.map(Field::stringValue) + .map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here? + .map(this::normalize) + .map(s -> filterAllStopWords(s)) + .map(this::doApply) + .map(c -> filterBlacklisted(c, ngramBlacklist)) + .flatMap(c -> c.stream()) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); + } } \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index a5f7bf679..24379c677 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -6,7 +6,6 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import eu.dnetlib.pace.clustering.NGramUtils; -import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.FieldListImpl; @@ -29,7 +28,6 @@ import java.util.stream.Stream; */ public abstract class AbstractPaceFunctions { - private static Map translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv"); private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); @@ -44,7 +42,10 @@ public abstract class AbstractPaceFunctions { private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; - private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; + private static final String aliases_to = 
"0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; + + private static final String special_from = "İə"; + private static final String special_to = "Ie"; public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; @@ -55,7 +56,8 @@ public abstract class AbstractPaceFunctions { } protected String cleanup(final String s) { - final String s0 = s.toLowerCase(); + final String ss = fixSpecial(s); //TODO is there an existing implementation that replaces unusual symbols with Latin letters? + final String s0 = ss.toLowerCase(); final String s1 = fixAliases(s0); final String s2 = nfd(s1); final String s3 = s2.replaceAll("–", " "); @@ -98,6 +100,16 @@ public abstract class AbstractPaceFunctions { return s.replaceAll("\\D", ""); } + //sometimes toLowerCase() produces errors; this is meant to prevent them by replacing special characters before lowercasing + protected static String fixSpecial(final String s) { + final StringBuilder sb = new StringBuilder(); + for (final char ch : Lists.charactersOf(s)) { + final int i = StringUtils.indexOf(special_from, ch); + sb.append(i >= 0 ? 
special_to.charAt(i) : ch); + } + return sb.toString(); + } + protected static String fixAliases(final String s) { final StringBuilder sb = new StringBuilder(); for (final char ch : Lists.charactersOf(s)) { @@ -154,7 +166,7 @@ public abstract class AbstractPaceFunctions { return sb.toString().trim(); } - protected String filterAllStopWords(String s) { + public String filterAllStopWords(String s) { s = filterStopWords(s, stopwords_en); s = filterStopWords(s, stopwords_de); @@ -193,12 +205,12 @@ public abstract class AbstractPaceFunctions { public static Map loadMapFromClasspath(final String classpath) { final Map m = new HashMap<>(); try { - for (final String s: IOUtils.readLines(JaroWinklerNormalizedName.class.getResourceAsStream(classpath))) { + for (final String s: IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) { //string is like this: code;word1;word2;word3 String[] line = s.split(";"); String value = line[0]; - for (String key: line){ - m.put(fixAliases(key).toLowerCase(),value); + for (int i=1; i getKeywords(String s1, Map translationMap, int windowSize){ - String s = cleanup(s1); - - s = filterAllStopWords(s); + String s = s1; List tokens = Arrays.asList(s.toLowerCase().split(" ")); diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv index 666615792..936c80adf 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv @@ -7178,7 +7178,7 @@ city::743882;Kastamonu;Castamena;Castamon;Castamoni;Castamonu;Castamuni;KFS;Kast city::743952;Kars;Cars;Gorad Kars;KSY;Kapc;Kars;Karsa;Karsas;Khuars;Kuars;Qars;Qers;Vanand;ka er si;kaleuseu;kar s;karasa;kars;karusu;qars;Καρς;Горад Карс;Карс;Къарс;Хъарс;Ҟарс;Կարս;קארס;قارص;قەرس;كارس;کارس;ਕਾਰਸ;การ์ส;ყარსი;カルス;卡爾斯;카르스; 
city::744562;Karabuk;Karabiukas;Karabjuk;Karabuek;Karabuk;Karabük;Karampouk;Qerebuk;ka la bi ke;ka la bi ke sheng;kalabwikeu;karabuka;karabwk;karabyukku;Καραμπούκ;Карабук;Карабюк;Карабүк;Карабӱк;قره‌بوک;قرہ بوک;كارابوك;کارابوک;ਕਾਰਾਬੁਕ;ყარაბუქი;カラビュック;卡拉比克;卡拉比克省;카라뷔크; city::745028;Izmit;Astacus;Cocaeli;Ismid;Ismit;Isnimid;Izmid;Izmit;Kodja-Eli;Koja-Ili;Nicomedia;Nicomedie;Nicomédie;Nikomedeia;Nikomedia;izumitto;yi zi mi te;İzmit;Измит;イズミット;伊兹密特; -city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;Istanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불; +city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;İstanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu 
er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불; city::745169;Inegol;Inegeul;Inegoel;Inegol;İnegöl; city::746666;Goelcuek;Geulzuk;Goelcuek;Gölcük; city::746881;Giresun;Cerasus;Choerades;Gireson;Giresun;Giresunas;Kerasounta;Kerassunde;Kerasun;Kerasunda;Kerasunt;Kiresun;OGU;Pharnacia;ghyrswn;gilesun;giresun;giresuni;grysn;gryswn;ji lei song;Κερασούντα;Гиресун;Ґіресун;Կերասուն;غيرسون;گره‌سون;گریسن;گریسون;გირესუნი;ギレスン;吉雷松;기레순; diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt index 8d0ceb776..e6131bca9 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt @@ -445,7 +445,6 @@ posto potrebbe preferibilmente presa -press prima primo principalmente diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv index 4aad426f0..c74b35786 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv @@ -1,6 +1,6 @@ -key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο +key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές 
-key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza; +key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas @@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο key::39;bureau;ufficio;bureau;офис;bureau;γραφείο key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία -key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology; +key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί @@ -46,8 +46,8 @@ key::45;institution;istituzione;institution;институциональный;i key::46;division;divisione;division;отделение;divisie;τμήμα key::47;committee;comitato;comité;комитет;commissie;επιτροπή 
key::48;promotion;promozione;продвижение;proothisis;forderung -key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline; -key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii; +key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline +key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;technical;texniki;teknik key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik; key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri; 
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus; @@ -100,4 +100,5 @@ key::99;neurology;neurologia;neurologiche;neurología;neurologia;Neurologie;neur key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia; key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia; key::102;informatics;informatica;informática;informática;informatica; -key:103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association \ No newline at end of file +key:103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association +key:104;commerce;ticaret;ticarət;commercio;trade;handel;comercio; \ No newline at end of file diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 265f3973f..84ec09006 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -141,6 +141,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println("s4 = " + s4); System.out.println(cf.apply(Lists.newArrayList(title(s4)))); + final String s5 = "İstanbul Ticarət Universiteti"; + System.out.println("s5 = " + s5); + System.out.println(cf.apply(Lists.newArrayList(title(s5)))); + } } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 1cce9a65b..ec55b8713 100644 --- 
a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -130,4 +130,22 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { System.out.println("result = " + result); } + @Test + public void testJaroWinklerNormalizedName9() { + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + + double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti"); + + System.out.println("result = " + result); + } + + @Test + public void testJaroWinklerNormalizedName10(){ + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + + double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence"); + + System.out.println("result = " + result); + } }