Add a fixSpecial function to handle special characters in organization names; add new terms to the translation maps

This commit is contained in:
miconis 2019-08-06 17:06:05 +02:00
parent 4502b44337
commit 8c867101ef
7 changed files with 68 additions and 23 deletions

View File

@ -1,16 +1,15 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
@ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction {
// Lookup tables built once, when the class is initialised, from CSV resources on the classpath
// (see AbstractPaceFunctions.loadMapFromClasspath for how each CSV row is turned into entries).
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
/**
 * Creates the clustering function, passing {@code params} unchanged to the superclass constructor.
 *
 * @param params named integer parameters handed to {@code AbstractClusteringFunction}
 */
public KeywordsClustering(Map<String, Integer> params) {
super(params);
}
@ -36,4 +35,18 @@ public class KeywordsClustering extends AbstractClusteringFunction {
return combinations;
}
/**
 * Builds the set of clustering keys for the given fields.
 *
 * Each non-empty field is reduced to its string value, then cleaned up, normalized and
 * stripped of stop words before being handed to {@code doApply}. The keys produced are
 * filtered against {@code ngramBlacklist}, blank keys are discarded, and the remainder is
 * collected into a {@link HashSet}, so duplicates collapse.
 *
 * @param fields the fields to derive keys from
 * @return the distinct, non-blank keys produced for all non-empty fields
 */
@Override
public Collection<String> apply(List<Field> fields) {
    final Set<String> keys = new HashSet<>();
    for (final Field field : fields) {
        if (field.isEmpty()) {
            continue;
        }
        //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
        final String cleaned = cleanup(field.stringValue());
        final String prepared = filterAllStopWords(normalize(cleaned));
        for (final String key : filterBlacklisted(doApply(prepared), ngramBlacklist)) {
            if (StringUtils.isNotBlank(key)) {
                keys.add(key);
            }
        }
    }
    return keys;
}
}

View File

@ -6,7 +6,6 @@ import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
@ -29,7 +28,6 @@ import java.util.stream.Stream;
*/
public abstract class AbstractPaceFunctions {
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
@ -46,6 +44,9 @@ public abstract class AbstractPaceFunctions {
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
// Positional substitution table used by fixSpecial: the character at index i of special_from
// is replaced by the character at index i of special_to, so both strings must keep the same length.
private static final String special_from = "İə";
private static final String special_to = "Ie";
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
@ -55,7 +56,8 @@ public abstract class AbstractPaceFunctions {
}
protected String cleanup(final String s) {
final String s0 = s.toLowerCase();
final String ss = fixSpecial(s); //TODO is there something implemented to replace strange symbols with latin letters?
final String s0 = ss.toLowerCase();
final String s1 = fixAliases(s0);
final String s2 = nfd(s1);
final String s3 = s2.replaceAll("&ndash;", " ");
@ -98,6 +100,16 @@ public abstract class AbstractPaceFunctions {
return s.replaceAll("\\D", "");
}
/**
 * Replaces every character found in {@code special_from} with the character at the same
 * position in {@code special_to}; all other characters are copied through unchanged.
 *
 * Meant to be applied before {@code toLowerCase()}, which sometimes produces wrong results
 * for these characters; substituting them first prevents that.
 *
 * @param s the string to fix
 * @return a copy of {@code s} with the special characters substituted
 */
protected static String fixSpecial(final String s) {
    final char[] chars = s.toCharArray();
    final StringBuilder out = new StringBuilder();
    for (int pos = 0; pos < chars.length; pos++) {
        final char current = chars[pos];
        final int idx = special_from.indexOf(current);
        if (idx < 0) {
            // not a special character: keep it as is
            out.append(current);
        } else {
            out.append(special_to.charAt(idx));
        }
    }
    return out.toString();
}
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
for (final char ch : Lists.charactersOf(s)) {
@ -154,7 +166,7 @@ public abstract class AbstractPaceFunctions {
return sb.toString().trim();
}
protected String filterAllStopWords(String s) {
public String filterAllStopWords(String s) {
s = filterStopWords(s, stopwords_en);
s = filterStopWords(s, stopwords_de);
@ -193,12 +205,12 @@ public abstract class AbstractPaceFunctions {
public static Map<String, String> loadMapFromClasspath(final String classpath) {
final Map<String, String> m = new HashMap<>();
try {
for (final String s: IOUtils.readLines(JaroWinklerNormalizedName.class.getResourceAsStream(classpath))) {
for (final String s: IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
//string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
for (String key: line){
m.put(fixAliases(key).toLowerCase(),value);
for (int i=1; i<line.length;i++){
m.put(line[i].toLowerCase(),value);
}
}
} catch (final Throwable e){
@ -287,9 +299,7 @@ public abstract class AbstractPaceFunctions {
//get the list of codes into the input string
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize){
String s = cleanup(s1);
s = filterAllStopWords(s);
String s = s1;
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));

View File

@ -7178,7 +7178,7 @@ city::743882;Kastamonu;Castamena;Castamon;Castamoni;Castamonu;Castamuni;KFS;Kast
city::743952;Kars;Cars;Gorad Kars;KSY;Kapc;Kars;Karsa;Karsas;Khuars;Kuars;Qars;Qers;Vanand;ka er si;kaleuseu;kar s;karasa;kars;karusu;qars;Καρς;Горад Карс;Карс;Къарс;Хъарс;Ҟарс;Կարս;קארס;قارص;قەرس;كارس;کارس;ਕਾਰਸ;การ์ส;ყარსი;カルス;卡爾斯;카르스;
city::744562;Karabuk;Karabiukas;Karabjuk;Karabuek;Karabuk;Karabük;Karampouk;Qerebuk;ka la bi ke;ka la bi ke sheng;kalabwikeu;karabuka;karabwk;karabyukku;Καραμπούκ;Карабук;Карабюк;Карабүк;Карабӱк;قره‌بوک;قرہ بوک;كارابوك;کارابوک;ਕਾਰਾਬੁਕ;ყარაბუქი;カラビュック;卡拉比克;卡拉比克省;카라뷔크;
city::745028;Izmit;Astacus;Cocaeli;Ismid;Ismit;Isnimid;Izmid;Izmit;Kodja-Eli;Koja-Ili;Nicomedia;Nicomedie;Nicomédie;Nikomedeia;Nikomedia;izumitto;yi zi mi te;İzmit;Измит;イズミット;伊兹密特;
city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;Istanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불;
city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;İstanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불;
city::745169;Inegol;Inegeul;Inegoel;Inegol;İnegöl;
city::746666;Goelcuek;Geulzuk;Goelcuek;Gölcük;
city::746881;Giresun;Cerasus;Choerades;Gireson;Giresun;Giresunas;Kerasounta;Kerassunde;Kerasun;Kerasunda;Kerasunt;Kiresun;OGU;Pharnacia;ghyrswn;gilesun;giresun;giresuni;grysn;gryswn;ji lei song;Κερασούντα;Гиресун;Ґіресун;Կերասուն;غيرسون;گره‌سون;گریسن;گریسون;გირესუნი;ギレスン;吉雷松;기레순;

Can't render this file because it is too large.

View File

@ -445,7 +445,6 @@ posto
potrebbe
preferibilmente
presa
press
prima
primo
principalmente

View File

@ -1,6 +1,6 @@
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza;
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology;
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
key::44;academic;accademico;académique;universitaire;академический;academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
@ -46,8 +46,8 @@ key::45;institution;istituzione;institution;институциональный;i
key::46;division;divisione;division;отделение;divisie;τμήμα
key::47;committee;comitato;comité;комитет;commissie;επιτροπή
key::48;promotion;promozione;продвижение;proothisis;forderung
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;technical;texniki;teknik
key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
@ -101,3 +101,4 @@ key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardku
key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia;
key::102;informatics;informatica;informática;informática;informatica;
key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association
key::104;commerce;ticaret;ticarət;commercio;trade;handel;comercio;
1 key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
2 key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
3 key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza; key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
4 key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
5 key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
6 key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
38 key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
39 key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
40 key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
41 key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology; key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology
42 key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
43 key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
44 key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
46 key::46;division;divisione;division;отделение;divisie;τμήμα
47 key::47;committee;comitato;comité;комитет;commissie;επιτροπή
48 key::48;promotion;promozione;продвижение;proothisis;forderung
49 key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline; key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline
50 key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii; key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;technical;texniki;teknik
51 key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
52 key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
53 key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
101 key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia;
102 key::102;informatics;informatica;informática;informática;informatica;
103 key:103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association
104 key:104;commerce;ticaret;ticarət;commercio;trade;handel;comercio;

View File

@ -141,6 +141,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println("s4 = " + s4);
System.out.println(cf.apply(Lists.newArrayList(title(s4))));
final String s5 = "İstanbul Ticarət Universiteti";
System.out.println("s5 = " + s5);
System.out.println(cf.apply(Lists.newArrayList(title(s5))));
}
}

View File

@ -130,4 +130,22 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
System.out.println("result = " + result);
}
/**
 * Prints the JaroWinklerNormalizedName distance between an organization name and a variant
 * containing the special characters İ and ə. The result is only printed, not asserted.
 */
@Test
public void testJaroWinklerNormalizedName9() {
    final String plainName = "Istanbul Commerce University";
    final String specialName = "İstanbul Ticarət Universiteti";
    final double distance = new JaroWinklerNormalizedName(params).distance(plainName, specialName);
    System.out.println("result = " + distance);
}
/**
 * Prints the JaroWinklerNormalizedName distance between "Firenze University Press" and
 * "University of Florence". The result is only printed, not asserted.
 */
@Test
public void testJaroWinklerNormalizedName10(){
    final String pressName = "Firenze University Press";
    final String universityName = "University of Florence";
    final double distance = new JaroWinklerNormalizedName(params).distance(pressName, universityName);
    System.out.println("result = " + distance);
}
}