forked from D-Net/dnet-hadoop
Add a fixSpecial function to handle special characters in organization names; add new terms to the translation maps
This commit is contained in:
parent
4502b44337
commit
8c867101ef
|
@ -1,16 +1,15 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ClusteringClass("keywordsclustering")
|
||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
||||
|
||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||
|
||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
/**
 * Creates the keywords clustering function.
 *
 * @param params configuration parameters, handed unchanged to the superclass constructor
 */
public KeywordsClustering(final Map<String, Integer> params) {
    super(params);
}
|
||||
|
@ -36,4 +35,18 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
|||
|
||||
return combinations;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(this::doApply)
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
}
|
|
@ -6,7 +6,6 @@ import com.google.common.collect.Iterables;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.FieldListImpl;
|
||||
|
@ -29,7 +28,6 @@ import java.util.stream.Stream;
|
|||
*/
|
||||
public abstract class AbstractPaceFunctions {
|
||||
|
||||
|
||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
|
@ -44,7 +42,10 @@ public abstract class AbstractPaceFunctions {
|
|||
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
private static final String special_from = "İə";
|
||||
private static final String special_to = "Ie";
|
||||
|
||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
|
@ -55,7 +56,8 @@ public abstract class AbstractPaceFunctions {
|
|||
}
|
||||
|
||||
protected String cleanup(final String s) {
|
||||
final String s0 = s.toLowerCase();
|
||||
final String ss = fixSpecial(s); //TODO is there something implemented to replace strange symbols with latin letters?
|
||||
final String s0 = ss.toLowerCase();
|
||||
final String s1 = fixAliases(s0);
|
||||
final String s2 = nfd(s1);
|
||||
final String s3 = s2.replaceAll("–", " ");
|
||||
|
@ -98,6 +100,16 @@ public abstract class AbstractPaceFunctions {
|
|||
return s.replaceAll("\\D", "");
|
||||
}
|
||||
|
||||
//sometimes the toLowerCase() produces error, this is meant to prevent them by replacing special character before the lowercase function
|
||||
protected static String fixSpecial(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
final int i = StringUtils.indexOf(special_from, ch);
|
||||
sb.append(i >= 0 ? special_to.charAt(i) : ch);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
|
@ -154,7 +166,7 @@ public abstract class AbstractPaceFunctions {
|
|||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
protected String filterAllStopWords(String s) {
|
||||
public String filterAllStopWords(String s) {
|
||||
|
||||
s = filterStopWords(s, stopwords_en);
|
||||
s = filterStopWords(s, stopwords_de);
|
||||
|
@ -193,12 +205,12 @@ public abstract class AbstractPaceFunctions {
|
|||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
final Map<String, String> m = new HashMap<>();
|
||||
try {
|
||||
for (final String s: IOUtils.readLines(JaroWinklerNormalizedName.class.getResourceAsStream(classpath))) {
|
||||
for (final String s: IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
|
||||
//string is like this: code;word1;word2;word3
|
||||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (String key: line){
|
||||
m.put(fixAliases(key).toLowerCase(),value);
|
||||
for (int i=1; i<line.length;i++){
|
||||
m.put(line[i].toLowerCase(),value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e){
|
||||
|
@ -287,9 +299,7 @@ public abstract class AbstractPaceFunctions {
|
|||
//get the list of codes found in the input string
|
||||
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize){
|
||||
|
||||
String s = cleanup(s1);
|
||||
|
||||
s = filterAllStopWords(s);
|
||||
String s = s1;
|
||||
|
||||
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||
|
||||
|
|
|
@ -7178,7 +7178,7 @@ city::743882;Kastamonu;Castamena;Castamon;Castamoni;Castamonu;Castamuni;KFS;Kast
|
|||
city::743952;Kars;Cars;Gorad Kars;KSY;Kapc;Kars;Karsa;Karsas;Khuars;Kuars;Qars;Qers;Vanand;ka er si;kaleuseu;kar s;karasa;kars;karusu;qars;Καρς;Горад Карс;Карс;Къарс;Хъарс;Ҟарс;Կարս;קארס;قارص;قەرس;كارس;کارس;ਕਾਰਸ;การ์ส;ყარსი;カルス;卡爾斯;카르스;
|
||||
city::744562;Karabuk;Karabiukas;Karabjuk;Karabuek;Karabuk;Karabük;Karampouk;Qerebuk;ka la bi ke;ka la bi ke sheng;kalabwikeu;karabuka;karabwk;karabyukku;Καραμπούκ;Карабук;Карабюк;Карабүк;Карабӱк;قرهبوک;قرہ بوک;كارابوك;کارابوک;ਕਾਰਾਬੁਕ;ყარაბუქი;カラビュック;卡拉比克;卡拉比克省;카라뷔크;
|
||||
city::745028;Izmit;Astacus;Cocaeli;Ismid;Ismit;Isnimid;Izmid;Izmit;Kodja-Eli;Koja-Ili;Nicomedia;Nicomedie;Nicomédie;Nikomedeia;Nikomedia;izumitto;yi zi mi te;İzmit;Измит;イズミット;伊兹密特;
|
||||
city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;Istanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불;
|
||||
city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;İstanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불;
|
||||
city::745169;Inegol;Inegeul;Inegoel;Inegol;İnegöl;
|
||||
city::746666;Goelcuek;Geulzuk;Goelcuek;Gölcük;
|
||||
city::746881;Giresun;Cerasus;Choerades;Gireson;Giresun;Giresunas;Kerasounta;Kerassunde;Kerasun;Kerasunda;Kerasunt;Kiresun;OGU;Pharnacia;ghyrswn;gilesun;giresun;giresuni;grysn;gryswn;ji lei song;Κερασούντα;Гиресун;Ґіресун;Կերասուն;غيرسون;گرهسون;گریسن;گریسون;გირესუნი;ギレスン;吉雷松;기레순;
|
||||
|
|
Can't render this file because it is too large.
|
|
@ -445,7 +445,6 @@ posto
|
|||
potrebbe
|
||||
preferibilmente
|
||||
presa
|
||||
press
|
||||
prima
|
||||
primo
|
||||
principalmente
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
|
||||
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
|
||||
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
|
||||
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza;
|
||||
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
|
||||
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
|
||||
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
|
||||
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
|
||||
|
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
|
|||
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
|
||||
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
|
||||
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
|
||||
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology;
|
||||
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology
|
||||
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
|
||||
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
|
||||
key::44;academic;accademico;académique;universitaire;академический;academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
|
||||
|
@ -46,8 +46,8 @@ key::45;institution;istituzione;institution;институциональный;i
|
|||
key::46;division;divisione;division;отделение;divisie;τμήμα
|
||||
key::47;committee;comitato;comité;комитет;commissie;επιτροπή
|
||||
key::48;promotion;promozione;продвижение;proothisis;forderung
|
||||
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
|
||||
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;
|
||||
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline
|
||||
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;technical;texniki;teknik
|
||||
key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
|
||||
key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
|
||||
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
|
||||
|
@ -101,3 +101,4 @@ key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardku
|
|||
key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia;
|
||||
key::102;informatics;informatica;informática;informática;informatica;
|
||||
key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association
|
||||
key::104;commerce;ticaret;ticarət;commercio;trade;handel;comercio;
|
|
|
@ -141,6 +141,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
System.out.println("s4 = " + s4);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s4))));
|
||||
|
||||
final String s5 = "İstanbul Ticarət Universiteti";
|
||||
System.out.println("s5 = " + s5);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s5))));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -130,4 +130,22 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
System.out.println("result = " + result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName9() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti");
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName10(){
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence");
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue