update of the comparator for legalnames of organizations

This commit is contained in:
Michele De Bonis 2019-03-21 14:27:27 +01:00
parent 0735f3a822
commit 662448e584
10 changed files with 165 additions and 248 deletions

View File

@ -1,16 +1,15 @@
package eu.dnetlib.pace.clustering;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
protected Map<String, Integer> params;
@ -26,7 +25,7 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue)
.map(this::normalize)
.map(s -> filterStopWords(s, stopwords))
.map(s -> filterAllStopWords(s))
.map(this::doApply)
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())

View File

@ -27,7 +27,12 @@ import java.util.regex.Pattern;
*/
public abstract class AbstractPaceFunctions {
protected static Set<String> stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
@ -42,8 +47,9 @@ public abstract class AbstractPaceFunctions {
}
protected String cleanup(final String s) {
final String s1 = nfd(s);
final String s2 = fixAliases(s1);
final String s0 = s.toLowerCase();
final String s1 = fixAliases(s0);
final String s2 = nfd(s1);
final String s3 = s2.replaceAll("&ndash;", " ");
final String s4 = s3.replaceAll("&amp;", " ");
final String s5 = s4.replaceAll("&quot;", " ");
@ -140,6 +146,18 @@ public abstract class AbstractPaceFunctions {
return sb.toString().trim();
}
/**
 * Filters the stop words of every bundled language out of {@code s}.
 * <p>
 * {@code filterStopWords} is applied once per stop-word set, each pass working on the
 * output of the previous one, in this order: English, German, Italian, French,
 * Portuguese, Spanish.
 *
 * @param s the string to filter
 * @return the string obtained after all six stop-word sets have been applied
 */
protected String filterAllStopWords(String s) {
    String filtered = s;
    // order matters only in that it mirrors the sequence of the individual calls
    for (final Set<String> languageStopwords : Arrays.asList(
            stopwords_en, stopwords_de, stopwords_it, stopwords_fr, stopwords_pt, stopwords_es)) {
        filtered = filterStopWords(filtered, languageStopwords);
    }
    return filtered;
}
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
final Set<String> newset = Sets.newLinkedHashSet();
for (final String s : set) {
@ -192,15 +210,7 @@ public abstract class AbstractPaceFunctions {
return sb.toString().trim();
}
/**
 * Replaces every space-delimited occurrence of a city name with its mapped value.
 * <p>
 * For each entry of {@code cityMap}, occurrences of {@code " " + key + " "} in
 * {@code s1} are replaced by {@code " " + value + " "}. Matching is literal: the key
 * is not interpreted as a regular expression and the value is not interpreted as a
 * replacement template, so names containing characters such as {@code .} or
 * {@code $} are handled verbatim.
 *
 * @param s1      the string to normalize; a city is only matched when it has a space
 *                on both sides
 * @param cityMap city name to replacement value; entries are applied in the map's
 *                iteration order
 * @return the string with all matched city names replaced
 */
public String normalizeCities(String s1, Map<String,String> cityMap){
    //TODO change normalization mode
    for (final Map.Entry<String, String> city : cityMap.entrySet()) {
        // String.replace is literal, unlike replaceAll which would treat the city name as a regex
        s1 = s1.replace(" " + city.getKey() + " ", " " + city.getValue() + " ");
    }
    return s1;
}
public String normalizeCities2 (String s1, Map<String, String> cityMap, int windowSize){
public String keywordsToCode(String s1, Map<String, String> translationMap, int windowSize){
List<String> tokens = Arrays.asList(s1.split(" "));
@ -213,9 +223,8 @@ public abstract class AbstractPaceFunctions {
for (int i = 0; i<=tokens.size()-length; i++){
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
if (cityMap.containsKey(candidate)) {
s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + cityMap.get(candidate) + " ");
return s1;
if (translationMap.containsKey(candidate)) {
s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + translationMap.get(candidate) + " ");
}
}
length-=1;
@ -229,9 +238,20 @@ public abstract class AbstractPaceFunctions {
final String regexKey = "\\bkey::[0-9]*\\b";
final String regexCity = "\\bcity::[0-9]*\\b";
return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim();
}
/**
 * Scores how many keywords two strings have in common.
 * <p>
 * The keywords of each string are extracted once with {@code getKeywords}. If either
 * string has no keywords the comparison is not meaningful and {@code 1.0} is
 * returned; otherwise the result is the size of the intersection of the two keyword
 * lists divided by the size of the longer list.
 *
 * @param s1 first string
 * @param s2 second string
 * @return {@code 1.0} when at least one string has no keywords, otherwise the shared
 *         keywords as a fraction of the longer keyword list
 */
public double keywordsCompare(String s1, String s2){
    final List<String> keywords1 = getKeywords(s1);
    final List<String> keywords2 = getKeywords(s2);

    // nothing to compare on at least one side: do not penalize the pair
    if (keywords1.isEmpty() || keywords2.isEmpty()) {
        return 1.0;
    }

    final int longer = Math.max(keywords1.size(), keywords2.size());
    return (double) CollectionUtils.intersection(keywords1, keywords2).size() / (double) longer;
}
//check if 2 strings have same keywords
public boolean sameKeywords(String s1, String s2){
//at least 1 keyword in common

View File

@ -47,27 +47,25 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
cb = removeStopwords(cb);
//replace keywords with codes
ca = translate(ca, translationMap);
cb = translate(cb, translationMap);
String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue());
String codesB = keywordsToCode(cb, translationMap, params.getOrDefault("windowSize",4).intValue());
//replace cities with codes
// String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
// ca = norm.split("\\|\\|\\|")[0].trim();
// cb = norm.split("\\|\\|\\|")[1].trim();
codesA = keywordsToCode(codesA, cityMap, params.getOrDefault("windowSize", 4).intValue());
codesB = keywordsToCode(codesB, cityMap, params.getOrDefault("windowSize", 4).intValue());
ca = normalizeCities2(ca, cityMap, params.getOrDefault("windowSize", 4).intValue());
cb = normalizeCities2(cb, cityMap, params.getOrDefault("windowSize", 4).intValue());
if (sameCity(ca,cb)){
if (sameKeywords(ca,cb)){
ca = removeCodes(ca);
cb = removeCodes(cb);
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca,cb));
}
//if two names have same city
if (sameCity(codesA,codesB)){
if (keywordsCompare(codesA, codesB)>params.getOrDefault("threshold", 0.5).doubleValue()) {
ca = removeCodes(codesA);
cb = removeCodes(codesB);
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca,cb));
}
}
return 0.0;
}

View File

@ -9,8 +9,6 @@ import eu.dnetlib.pace.distance.eval.ScoreResult;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentComparator;
import eu.dnetlib.pace.model.TreeNodeDef;
import eu.dnetlib.pace.tree.support.MatchType;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -45,67 +43,13 @@ public class BlockProcessor {
if (q.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size());
//process(q, context);
process(simplifyQueue(q, key, context), context);
//process the decision tree if it is specified, otherwise go with conditions and distance algos
if (!dedupConf.getPace().getDecisionTree().isEmpty()){
processPersons(q, context);
}
else {
process(simplifyQueue(q, key, context), context);
}
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
}
}
/**
 * Compares every pair of documents in the queue through the decision tree.
 * <p>
 * The head of the queue is removed and used as pivot, then compared against every
 * document still in the queue; a similarity is written for each pair whose
 * {@code navigateTree} result is anything other than {@code MatchType.NO_MATCH}.
 * This repeats until the queue is empty, so the queue is consumed by this call.
 *
 * @param queue   documents to compare; emptied by this method
 * @param context reporter passed to {@code writeSimilarity}
 */
private void processPersons(final Queue<MapDocument> queue, final Reporter context) {
    while (!queue.isEmpty()) {
        // the pivot leaves the queue, so it is never compared with itself
        final MapDocument pivot = queue.remove();
        final String pivotId = pivot.getIdentifier();

        for (final MapDocument candidate : queue) {
            final String candidateId = candidate.getIdentifier();
            final boolean matched = navigateTree(pivot, candidate) != MatchType.NO_MATCH;
            if (matched) {
                writeSimilarity(context, pivotId, candidateId);
            }
        }
    }
}
/**
 * Walks the configured decision tree for a pair of documents.
 * <p>
 * Navigation starts at the node named {@code "start"}. At each node the pair is
 * evaluated: a similarity of {@code -1} follows the node's "undefined" branch, a
 * similarity greater than or equal to the node's threshold follows the "positive"
 * branch, anything else follows the "negative" branch. The walk stops as soon as the
 * current name maps to a {@code MatchType} other than {@code UNDEFINED}.
 *
 * @param doc1 first document
 * @param doc2 second document
 * @return the {@code MatchType} the walk ends on
 * @throws PaceException if a branch points to a node name missing from the tree
 */
public MatchType navigateTree(final MapDocument doc1, final MapDocument doc2){
    final Map<String, TreeNodeDef> tree = dedupConf.getPace().getDecisionTree();

    String nodeName = "start";
    MatchType outcome = MatchType.getEnum(nodeName);

    while (outcome == MatchType.UNDEFINED) {
        final TreeNodeDef node = tree.get(nodeName);
        if (node == null) {
            throw new PaceException("The Tree Node doesn't exist: " + nodeName);
        }

        final double score = node.evaluate(doc1, doc2);
        if (score == -1) {
            nodeName = node.getUndefined();
        } else if (score >= node.getThreshold()) {
            nodeName = node.getPositive();
        } else {
            nodeName = node.getNegative();
        }
        outcome = MatchType.getEnum(nodeName);
    }

    return outcome;
}
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField()));

View File

@ -611,7 +611,6 @@ terzo
th
ti
titolo
torino
tra
tranne
tre

View File

@ -1,11 +1,11 @@
key::1;university;università;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
key::7;college;collegio;université;colegio;faculdade;Hochschule;Szkoła Wyższa;Высшая школа;universiteit;κολλέγιο
key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα
key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα;idryma
key::9;center;centro;centre;centro;centro;zentrum;centrum;центр;centrum;κέντρο
key::10;national;nazionale;national;nationale;nationaux;nationales;nacional;nacional;national;krajowy;национальный;nationaal;nationale;εθνικό
key::11;association;associazione;association;asociación;associação;Verein;verband;stowarzyszenie;ассоциация;associatie
@ -45,3 +45,59 @@ key::44;academic;accademico;académique;universitaire;акадеческий aca
key::45;institution;istituzione;institution;институциональный;instelling;ινστιτούτο
key::46;division;divisione;division;отделение;divisie;τμήμα
key::47;committee;comitato;comité;комитет;commissie;επιτροπή
key::48;promotion;promozione;продвижение;proothisis;forderung
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;
key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia;
key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik;
key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon;
key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne;
key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;;
key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri;
key::60;pharmaceuticals;pharmacy;farmacia;farmaceutica;farmacéutica;farmacia;farmacêutica;farmácia;Pharmazeutika;Arzneimittelkunde;farmaceutica;geneesmiddelen;apotheek;φαρμακευτικός;φαρμακευτική;φαρμακευτικό;φαρμακευτικά;φαρμακείο;ilaç;eczane;gyógyszerészeti;gyógyszertár;farmacevtika;lekarništvo;farmaatsia;farmatseutiline;
key::61;healthcare;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu;
key::62;history;storia;historia;história;Geschichte;geschiedenis;geschiedkunde;ιστορία;tarih;történelem;zgodovina;ajalugu;
key::63;materials;materiali;materia;materiales;materiais;materialen;υλικά;τεκμήρια;malzemeler;anyagok;materiali;materjalid;vahendid;
key::64;economics;economia;economiche;economica;economía;economia;Wirtschaft;economie;οικονομικά;οικονομικέςεπιστήμες;ekonomi;közgazdaságtan;gospodarstvo;ekonomija;majanduslik;majandus;
key::65;therapeutics;terapeutica;terapéutica;terapêutica;therapie;θεραπευτική;tedavibilimi;gyógykezelés;terapevtika;terapeutiline;ravi;
key::66;oncology;oncologia;oncologico;oncología;oncologia;Onkologie;oncologie;ογκολογία;onkoloji;onkológia;onkologija;onkoloogia;
key::67;natural;naturali;naturale;natural;natural;natürlich;natuurlijk;φυσικός;φυσική;φυσικό;φυσικά;doğal;természetes;naraven;loodus;
key::68;educational;educazione;pedagogia;educacional;educativo;educacional;pädagogisch;educatief;εκπαιδευτικός;εκπαιδευτική;εκπαιδευτικό;εκπαιδευτικά;eğitimsel;oktatási;izobraževalen;haridus;hariduslik;
key::69;biomedical;biomedica;biomédico;biomédico;biomedizinisch;biomedisch;βιοιατρικός;βιοιατρική;βιοιατρικό;βιοιατρικά;biyomedikal;orvosbiológiai;biomedicinski;biomeditsiiniline;
key::70;veterinary;veterinaria;veterinarie;veterinaria;veterinária;tierärtzlich;veterinair;veeartsenijlkunde;κτηνιατρικός;κτηνιατρική;κτηνιατρικό;κτηνιατρικά;veteriner;állatorvosi;veterinar;veterinarski;veterinaaria;
key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia;
key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek;
key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia;
key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar;
key::75;theological;teologia;teologico;teológico;tecnológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;technológiai;teološki;teoloogia;usuteadus;teoloogiline;
key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika;
key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus;
key::78;maritime;marittima;marittime;marittimo;marítimo;marítimo;maritiem;ναυτικός;ναυτική;ναυτικό;ναυτικά;ναυτιλιακός;ναυτιλιακή;ναυτιλιακό;ναυτιλιακά;θαλάσσιος;θαλάσσια;θαλάσσιο;denizcilik;tengeri;morski;mere;merendus;
key::79;sports;sport;deportes;esportes;Sport;sport;sportwetenschappen;άθληση;γυμναστικήδραστηριότητα;spor;sport;šport;sport;spordi;
key::80;surgery;chirurgia;chirurgiche;cirugía;cirurgia;Chirurgie;chirurgie;heelkunde;εγχείρηση;επέμβαση;χειρουργικήεπέμβαση;cerrahi;sebészet;kirurgija;kirurgia;
key::81;cultural;culturale;culturali;cultura;cultural;cultural;kulturell;cultureel;πολιτιστικός;πολιτιστική;πολιτιστικό;πολιτισμικός;πολιτισμική;πολιτισμικό;kültürel;kultúrális;kulturni;kultuuri;kultuuriline;
key::82;computerscience;informatica;ordenador;computadora;informática;computación;cienciasdelacomputación;ciênciadacomputação;Computer;computer;υπολογιστής;ηλεκτρονικόςυπολογιστής;bilgisayar;számítógép;računalnik;arvuti;
key::83;finance;financial;finanza;finanziarie;finanza;financiero;finanças;financeiro;Finanzen;finanziell;financiën;financieel;χρηματοοικονομικά;χρηματοδότηση;finanse;finansal;pénzügy;pénzügyi;finance;finančni;finants;finantsiline;
key::84;communication;comunicazione;comuniciación;comunicação;Kommunikation;communication;επικοινωνία;iletişim;kommunikáció;komuniciranje;kommunikatsioon;
key::85;justice;giustizia;justicia;justiça;Recht;Justiz;justitie;gerechtigheid;δικαιοσύνη;υπουργείοδικαιοσύνης;δίκαιο;adalet;igazságügy;pravo;õigus;
key::86;aerospace;aerospaziale;aerospaziali;aeroespacio;aeroespaço;Luftfahrt;luchtvaart;ruimtevaart;αεροπορικός;αεροπορική;αεροπορικό;αεροναυπηγικός;αεροναυπηγική;αεροναυπηγικό;αεροναυπηγικά;havacılıkveuzay;légtér;zrakoplovstvo;atmosfäär;kosmos;
key::87;dermatology;dermatologia;dermatología;dermatologia;Dermatologie;dermatologie;δρματολογία;dermatoloji;bőrgyógyászat;dermatológia;dermatologija;dermatoloogia;
key::88;architecture;architettura;arquitectura;arquitetura;Architektur;architectuur;αρχιτεκτονική;mimarlık;építészet;arhitektura;arhitektuur;
key::89;mathematics;matematica;matematiche;matemáticas;matemáticas;Mathematik;wiskunde;mathematica;μαθηματικά;matematik;matematika;matematika;matemaatika;
key::90;language;lingue;linguistica;linguistiche;lenguaje;idioma;língua;idioma;Sprache;taal;taalkunde;γλώσσα;dil;nyelv;jezik;keel;
key::91;neuroscience;neuroscienza;neurociencia;neurociência;Neurowissenschaft;neurowetenschappen;νευροεπιστήμη;nörobilim;idegtudomány;nevroznanost;neuroteadused;
key::92;automation;automazione;automatización;automação;Automatisierung;automatisering;αυτοματοποίηση;otomasyon;automatizálás;avtomatizacija;automatiseeritud;
key::93;pediatric;pediatria;pediatriche;pediatrico;pediátrico;pediatría;pediátrico;pediatria;pädiatrisch;pediatrische;παιδιατρική;pediatrik;gyermekgyógyászat;pediatrija;pediaatria;
key::94;photonics;fotonica;fotoniche;fotónica;fotônica;Photonik;fotonica;φωτονική;fotonik;fotonika;fotonika;fotoonika;
key::95;mechanics;meccanica;meccaniche;mecánica;mecânica;Mechanik;Maschinenbau;mechanica;werktuigkunde;μηχανικής;mekanik;gépészet;mehanika;mehaanika;
key::96;psychiatrics;psichiatria;psichiatrica;psichiatriche;psiquiatría;psiquiatria;Psychiatrie;psychiatrie;ψυχιατρική;psikiyatrik;pszihiátria;psihiatrija;psühhaatria;
key::97;psychology;fisiologia;psicología;psicologia;Psychologie;psychologie;ψυχολογία;psikoloji;pszihológia;psihologija;psühholoogia;
key::98;automotive;industriaautomobilistica;industriadelautomóvil;automotriz;industriaautomotriz;automotivo;Automobilindustrie;autoindustrie;αυτοκίνητος;αυτοκίνητη;αυτοκίνητο;αυτοκινούμενος;αυτοκινούμενη;αυτοκινούμενο;αυτοκινητιστικός;αυτοκινητιστική;αυτοκινητιστικό;otomotiv;autóipari;samogiben;avtomobilskaindustrija;auto-;
key::99;neurology;neurologia;neurologiche;neurología;neurologia;Neurologie;neurologie;zenuwleer;νευρολογία;nöroloji;neurológia;ideggyógyászat;nevrologija;neuroloogia;
key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia;
key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia;
key::102;informatics;informatica;informática;informática;informatica;
key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association

1 key::1;university;università;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο
2 key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
3 key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα
4 key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
5 key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
6 key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
7 key::7;college;collegio;université;colegio;faculdade;Hochschule;Szkoła Wyższa;Высшая школа;universiteit;κολλέγιο key::7;college;collegio;université;colegio;faculdade;Hochschule;Szkoła Wyższa;Высшая школа;universiteit;κολλέγιο
8 key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα;idryma
9 key::9;center;centro;centre;centro;centro;zentrum;centrum;центр;centrum;κέντρο key::9;center;centro;centre;centro;centro;zentrum;centrum;центр;centrum;κέντρο
10 key::10;national;nazionale;national;nationale;nationaux;nationales;nacional;nacional;national;krajowy;национальный;nationaal;nationale;εθνικό key::10;national;nazionale;national;nationale;nationaux;nationales;nacional;nacional;national;krajowy;национальный;nationaal;nationale;εθνικό
11 key::11;association;associazione;association;asociación;associação;Verein;verband;stowarzyszenie;ассоциация;associatie key::11;association;associazione;association;asociación;associação;Verein;verband;stowarzyszenie;ассоциация;associatie
45 key::45;institution;istituzione;institution;институциональный;instelling;ινστιτούτο key::45;institution;istituzione;institution;институциональный;instelling;ινστιτούτο
46 key::46;division;divisione;division;отделение;divisie;τμήμα key::46;division;divisione;division;отделение;divisie;τμήμα
47 key::47;committee;comitato;comité;комитет;commissie;επιτροπή key::47;committee;comitato;comité;комитет;commissie;επιτροπή
48 key::48;promotion;promozione;продвижение;proothisis;forderung
49 key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
50 key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;
51 key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
52 key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
53 key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
54 key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia;
55 key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik;
56 key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon;
57 key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne;
58 key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;;
59 key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri;
60 key::60;pharmaceuticals;pharmacy;farmacia;farmaceutica;farmacéutica;farmacia;farmacêutica;farmácia;Pharmazeutika;Arzneimittelkunde;farmaceutica;geneesmiddelen;apotheek;φαρμακευτικός;φαρμακευτική;φαρμακευτικό;φαρμακευτικά;φαρμακείο;ilaç;eczane;gyógyszerészeti;gyógyszertár;farmacevtika;lekarništvo;farmaatsia;farmatseutiline;
61 key::61;healthcare;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu;
62 key::62;history;storia;historia;história;Geschichte;geschiedenis;geschiedkunde;ιστορία;tarih;történelem;zgodovina;ajalugu;
63 key::63;materials;materiali;materia;materiales;materiais;materialen;υλικά;τεκμήρια;malzemeler;anyagok;materiali;materjalid;vahendid;
64 key::64;economics;economia;economiche;economica;economía;economia;Wirtschaft;economie;οικονομικά;οικονομικέςεπιστήμες;ekonomi;közgazdaságtan;gospodarstvo;ekonomija;majanduslik;majandus;
65 key::65;therapeutics;terapeutica;terapéutica;terapêutica;therapie;θεραπευτική;tedavibilimi;gyógykezelés;terapevtika;terapeutiline;ravi;
66 key::66;oncology;oncologia;oncologico;oncología;oncologia;Onkologie;oncologie;ογκολογία;onkoloji;onkológia;onkologija;onkoloogia;
67 key::67;natural;naturali;naturale;natural;natural;natürlich;natuurlijk;φυσικός;φυσική;φυσικό;φυσικά;doğal;természetes;naraven;loodus;
68 key::68;educational;educazione;pedagogia;educacional;educativo;educacional;pädagogisch;educatief;εκπαιδευτικός;εκπαιδευτική;εκπαιδευτικό;εκπαιδευτικά;eğitimsel;oktatási;izobraževalen;haridus;hariduslik;
69 key::69;biomedical;biomedica;biomédico;biomédico;biomedizinisch;biomedisch;βιοιατρικός;βιοιατρική;βιοιατρικό;βιοιατρικά;biyomedikal;orvosbiológiai;biomedicinski;biomeditsiiniline;
70 key::70;veterinary;veterinaria;veterinarie;veterinaria;veterinária;tierärtzlich;veterinair;veeartsenijlkunde;κτηνιατρικός;κτηνιατρική;κτηνιατρικό;κτηνιατρικά;veteriner;állatorvosi;veterinar;veterinarski;veterinaaria;
71 key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia;
72 key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek;
73 key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia;
74 key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar;
75 key::75;theological;teologia;teologico;teológico;tecnológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;technológiai;teološki;teoloogia;usuteadus;teoloogiline;
76 key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika;
77 key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus;
78 key::78;maritime;marittima;marittime;marittimo;marítimo;marítimo;maritiem;ναυτικός;ναυτική;ναυτικό;ναυτικά;ναυτιλιακός;ναυτιλιακή;ναυτιλιακό;ναυτιλιακά;θαλάσσιος;θαλάσσια;θαλάσσιο;denizcilik;tengeri;morski;mere;merendus;
79 key::79;sports;sport;deportes;esportes;Sport;sport;sportwetenschappen;άθληση;γυμναστικήδραστηριότητα;spor;sport;šport;sport;spordi;
80 key::80;surgery;chirurgia;chirurgiche;cirugía;cirurgia;Chirurgie;chirurgie;heelkunde;εγχείρηση;επέμβαση;χειρουργικήεπέμβαση;cerrahi;sebészet;kirurgija;kirurgia;
81 key::81;cultural;culturale;culturali;cultura;cultural;cultural;kulturell;cultureel;πολιτιστικός;πολιτιστική;πολιτιστικό;πολιτισμικός;πολιτισμική;πολιτισμικό;kültürel;kultúrális;kulturni;kultuuri;kultuuriline;
82 key::82;computerscience;informatica;ordenador;computadora;informática;computación;cienciasdelacomputación;ciênciadacomputação;Computer;computer;υπολογιστής;ηλεκτρονικόςυπολογιστής;bilgisayar;számítógép;računalnik;arvuti;
83 key::83;finance;financial;finanza;finanziarie;finanza;financiero;finanças;financeiro;Finanzen;finanziell;financiën;financieel;χρηματοοικονομικά;χρηματοδότηση;finanse;finansal;pénzügy;pénzügyi;finance;finančni;finants;finantsiline;
84 key::84;communication;comunicazione;comuniciación;comunicação;Kommunikation;communication;επικοινωνία;iletişim;kommunikáció;komuniciranje;kommunikatsioon;
85 key::85;justice;giustizia;justicia;justiça;Recht;Justiz;justitie;gerechtigheid;δικαιοσύνη;υπουργείοδικαιοσύνης;δίκαιο;adalet;igazságügy;pravo;õigus;
86 key::86;aerospace;aerospaziale;aerospaziali;aeroespacio;aeroespaço;Luftfahrt;luchtvaart;ruimtevaart;αεροπορικός;αεροπορική;αεροπορικό;αεροναυπηγικός;αεροναυπηγική;αεροναυπηγικό;αεροναυπηγικά;havacılıkveuzay;légtér;zrakoplovstvo;atmosfäär;kosmos;
87 key::87;dermatology;dermatologia;dermatología;dermatologia;Dermatologie;dermatologie;δρματολογία;dermatoloji;bőrgyógyászat;dermatológia;dermatologija;dermatoloogia;
88 key::88;architecture;architettura;arquitectura;arquitetura;Architektur;architectuur;αρχιτεκτονική;mimarlık;építészet;arhitektura;arhitektuur;
89 key::89;mathematics;matematica;matematiche;matemáticas;matemáticas;Mathematik;wiskunde;mathematica;μαθηματικά;matematik;matematika;matematika;matemaatika;
90 key::90;language;lingue;linguistica;linguistiche;lenguaje;idioma;língua;idioma;Sprache;taal;taalkunde;γλώσσα;dil;nyelv;jezik;keel;
91 key::91;neuroscience;neuroscienza;neurociencia;neurociência;Neurowissenschaft;neurowetenschappen;νευροεπιστήμη;nörobilim;idegtudomány;nevroznanost;neuroteadused;
92 key::92;automation;automazione;automatización;automação;Automatisierung;automatisering;αυτοματοποίηση;otomasyon;automatizálás;avtomatizacija;automatiseeritud;
93 key::93;pediatric;pediatria;pediatriche;pediatrico;pediátrico;pediatría;pediátrico;pediatria;pädiatrisch;pediatrische;παιδιατρική;pediatrik;gyermekgyógyászat;pediatrija;pediaatria;
94 key::94;photonics;fotonica;fotoniche;fotónica;fotônica;Photonik;fotonica;φωτονική;fotonik;fotonika;fotonika;fotoonika;
95 key::95;mechanics;meccanica;meccaniche;mecánica;mecânica;Mechanik;Maschinenbau;mechanica;werktuigkunde;μηχανικής;mekanik;gépészet;mehanika;mehaanika;
96 key::96;psychiatrics;psichiatria;psichiatrica;psichiatriche;psiquiatría;psiquiatria;Psychiatrie;psychiatrie;ψυχιατρική;psikiyatrik;pszihiátria;psihiatrija;psühhaatria;
97 key::97;psychology;fisiologia;psicología;psicologia;Psychologie;psychologie;ψυχολογία;psikoloji;pszihológia;psihologija;psühholoogia;
98 key::98;automotive;industriaautomobilistica;industriadelautomóvil;automotriz;industriaautomotriz;automotivo;Automobilindustrie;autoindustrie;αυτοκίνητος;αυτοκίνητη;αυτοκίνητο;αυτοκινούμενος;αυτοκινούμενη;αυτοκινούμενο;αυτοκινητιστικός;αυτοκινητιστική;αυτοκινητιστικό;otomotiv;autóipari;samogiben;avtomobilskaindustrija;auto-;
99 key::99;neurology;neurologia;neurologiche;neurología;neurologia;Neurologie;neurologie;zenuwleer;νευρολογία;nöroloji;neurológia;ideggyógyászat;nevrologija;neuroloogia;
100 key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia;
101 key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia;
102 key::102;informatics;informatica;informática;informática;informatica;
103 key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association

View File

@ -0,0 +1,4 @@
/**
 * Empty placeholder class: it currently declares no fields, methods or tests.
 */
public class DedupTestIT {}

View File

@ -10,6 +10,7 @@ import java.util.HashMap;
import java.util.Map;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;
public class DistanceAlgoTest extends AbstractPaceFunctions {
@ -61,5 +62,45 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
assertEquals(result, 1.0);
}
/**
 * Checks that the normalized-name comparator scores a library name that embeds a university
 * name against the bare university name as {@code 0.0}.
 */
@Test
public void testJaroWinklerNormalizedName3() {
    final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
    final double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna");

    System.out.println("result = " + result);

    // JUnit's contract is (expected, actual, delta): keeping the expected value first makes a
    // failure report read correctly. A delta of 0.0 keeps the comparison exact.
    assertEquals(0.0, result, 0.0);
}
/**
 * Checks that two spellings of the same university name, one with the extra words
 * "degli studi", are scored as {@code 1.0} by the normalized-name comparator.
 */
@Test
public void testJaroWinklerNormalizedName4() {
    final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
    final double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa");

    System.out.println("result = " + result);

    // JUnit's contract is (expected, actual, delta): keeping the expected value first makes a
    // failure report read correctly. A delta of 0.0 keeps the comparison exact.
    assertEquals(1.0, result, 0.0);
}
/**
 * Checks that an English organization name and a name in a different language are scored
 * as {@code 1.0} by the normalized-name comparator.
 * NOTE(review): presumably the second name is the transliterated Greek equivalent of the
 * first and the match relies on keyword translation - confirm against the comparator.
 */
@Test
public void testJaroWinklerNormalizedName5() {
    final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
    final double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS");

    System.out.println("result = " + result);

    // JUnit's contract is (expected, actual, delta): keeping the expected value first makes a
    // failure report read correctly. A delta of 0.0 keeps the comparison exact.
    assertEquals(1.0, result, 0.0);
}
/**
 * Checks that a legal name followed by a parenthesised English alias still scores above
 * {@code 0.9} against the same legal name without the alias.
 */
@Test
public void testJaroWinklerNormalizedName6() {
    final String nameWithAlias = "Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)";
    final String plainName = "Fonds zur Förderung der wissenschaftlichen Forschung";

    final JaroWinklerNormalizedName comparator = new JaroWinklerNormalizedName(params);
    final double score = comparator.distance(nameWithAlias, plainName);

    System.out.println("result = " + score);

    assertTrue(score > 0.9);
}
}

View File

@ -1,144 +0,0 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import org.junit.Before;
import org.junit.Test;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;
/**
 * Test class for the comparators (to be used in the tree nodes).
 *
 * <p>Each test builds a few sample fields, runs one comparator on pairs of them, prints the
 * inputs and scores, and checks the scores against fixed expected values.
 */
public class ComparatorTest extends AbstractPaceTest {

    /** Delta for score assertions: the expected scores are compared exactly. */
    private static final double EXACT = 0.0;

    /** Parameters handed to every comparator constructor; rebuilt before each test. */
    private Map<String, Number> params;

    /** Populates the shared parameter map with the settings the comparators are given. */
    @Before
    public void setup() {
        params = new HashMap<>();

        //to put all the needed parameters
        params.put("minCoauthors", 5);
        params.put("maxCoauthors", 200);
    }

    /**
     * Compares a seven-entry coauthor list with a one-entry list (expected -1.0) and with
     * itself (expected 7.0).
     */
    @Test
    public void testCoauthorsMatch() {
        final CoauthorsMatch coauthorsMatch = new CoauthorsMatch(params);

        Field a = createFieldList(Arrays.asList("la bruzzo, sandro", "atzori, claudio", "artini, michele", "de bonis, michele", "bardi, alessia", "dell'amico, andrea", "baglioni, miriam"), "coauthors");
        Field b = createFieldList(Arrays.asList("la bruzzo, sandro"), "coauthors");

        double result1 = coauthorsMatch.compare(a, b);
        double result2 = coauthorsMatch.compare(a, a);

        System.out.println("a = " + a);
        System.out.println("b = " + b);
        System.out.println("a vs b = " + result1);
        System.out.println("a vs a = " + result2);

        // Expected value first, as JUnit requires, so failure messages are not reversed.
        assertEquals(-1.0, result1, EXACT);
        assertEquals(7.0, result2, EXACT);
    }

    /**
     * Compares a DOI with itself (expected 1.0), with a different DOI (expected 0.0) and with
     * an empty value (expected -1.0).
     */
    @Test
    public void testExactMatch() {
        final ExactMatch exactMatch = new ExactMatch(params);

        Field a = new FieldValueImpl(Type.String, "doi", "10.1000/0000000000");
        Field b = new FieldValueImpl(Type.String, "doi", "10.1033/0000000000");
        Field c = new FieldValueImpl(Type.String, "doi", "");

        double result1 = exactMatch.compare(a, a);
        double result2 = exactMatch.compare(a, b);
        double result3 = exactMatch.compare(a, c);

        System.out.println("a = " + a);
        System.out.println("b = " + b);
        System.out.println("c = " + c);
        System.out.println("a vs a = " + result1);
        System.out.println("a vs b = " + result2);
        System.out.println("a vs c = " + result3);

        // Expected value first, as JUnit requires, so failure messages are not reversed.
        assertEquals(1.0, result1, EXACT);
        assertEquals(0.0, result2, EXACT);
        assertEquals(-1.0, result3, EXACT);
    }

    /**
     * Compares first names: "sandro" vs "s." and "s." vs "stefano" are expected to score 1.0,
     * while "sandro" vs "stefano" is expected to score below 0.7.
     */
    @Test
    public void testSimilarMatch() {
        final SimilarMatch similarMatch = new SimilarMatch(params);

        Field a = new FieldValueImpl(Type.String, "firstname", "sandro");
        Field b = new FieldValueImpl(Type.String, "firstname", "s.");
        Field c = new FieldValueImpl(Type.String, "firstname", "stefano");

        double result1 = similarMatch.compare(a, b);
        double result2 = similarMatch.compare(a, c);
        double result3 = similarMatch.compare(b, c);

        System.out.println("a = " + a);
        System.out.println("b = " + b);
        System.out.println("c = " + c);
        System.out.println("a vs b = " + result1);
        System.out.println("a vs c = " + result2);
        System.out.println("b vs c = " + result3);

        // Expected value first, as JUnit requires, so failure messages are not reversed.
        assertEquals(1.0, result1, EXACT);
        assertEquals(1.0, result3, EXACT);
        assertTrue(result2 < 0.7);
    }

    /**
     * Compares three-entry topic lists: a list with itself (expected 1.0), two lists that
     * share half of their weight (expected 0.5) and two lists with no shared weight
     * (expected 0.0).
     */
    @Test
    public void testTopicsMatch() {
        final TopicsMatch topicsMatch = new TopicsMatch(params);

        Field a = createFieldList(Arrays.asList("0.0", "1.0", "0.0"), "topics");
        Field b = createFieldList(Arrays.asList("0.0", "0.0", "1.0"), "topics");
        Field c = createFieldList(Arrays.asList("0.5", "0.5", "0.0"), "topics");

        double result1 = topicsMatch.compare(a, a);
        double result2 = topicsMatch.compare(a, c);
        double result3 = topicsMatch.compare(b, c);

        System.out.println("a = " + a);
        System.out.println("b = " + b);
        System.out.println("c = " + c);
        System.out.println("a vs a = " + result1);
        System.out.println("a vs c = " + result2);
        System.out.println("b vs c = " + result3);

        // Expected value first, as JUnit requires, so failure messages are not reversed.
        assertEquals(1.0, result1, EXACT);
        assertEquals(0.5, result2, EXACT);
        assertEquals(0.0, result3, EXACT);
    }

    /** Checks that the undefined node scores two empty field lists as 0.0. */
    @Test
    public void testUndefinedNode() {
        final UndefinedNode undefinedNode = new UndefinedNode();
        double result = undefinedNode.compare(new FieldListImpl(), new FieldListImpl());

        // Expected value first, as JUnit requires, so failure messages are not reversed.
        assertEquals(0.0, result, EXACT);
    }
}