From 662448e5841c727449c113bf12eef4eb6433b44b Mon Sep 17 00:00:00 2001 From: Michele De Bonis Date: Thu, 21 Mar 2019 14:27:27 +0100 Subject: [PATCH] update of the comparator for legalnames of organizations --- .../AbstractClusteringFunction.java | 15 +- .../pace/common/AbstractPaceFunctions.java | 52 +++++-- .../algo/JaroWinklerNormalizedName.java | 32 ++-- .../eu/dnetlib/pace/util/BlockProcessor.java | 58 +------ .../eu/dnetlib/pace/config/city_map.csv | 2 +- .../eu/dnetlib/pace/config/stopwords_it.txt | 1 - .../dnetlib/pace/config/translation_map.csv | 64 +++++++- dnet-pace-core/src/test/java/DedupTestIT.java | 4 + .../pace/distance/DistanceAlgoTest.java | 41 +++++ .../eu/dnetlib/pace/tree/ComparatorTest.java | 144 ------------------ 10 files changed, 165 insertions(+), 248 deletions(-) create mode 100644 dnet-pace-core/src/test/java/DedupTestIT.java delete mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/tree/ComparatorTest.java diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index f9192ad51..1782b8761 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -1,16 +1,15 @@ package eu.dnetlib.pace.clustering; -import java.util.*; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -import com.google.common.collect.Sets; - import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { protected Map 
params; @@ -26,7 +25,7 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i return fields.stream().filter(f -> !f.isEmpty()) .map(Field::stringValue) .map(this::normalize) - .map(s -> filterStopWords(s, stopwords)) + .map(s -> filterAllStopWords(s)) .map(this::doApply) .map(c -> filterBlacklisted(c, ngramBlacklist)) .flatMap(c -> c.stream()) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 757241192..977c5c46e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -27,7 +27,12 @@ import java.util.regex.Pattern; */ public abstract class AbstractPaceFunctions { - protected static Set stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); + protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); + protected static Set stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt"); + protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); + protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); @@ -42,8 +47,9 @@ public abstract class AbstractPaceFunctions { } protected String cleanup(final String s) { - final String s1 = nfd(s); - final String s2 = fixAliases(s1); + final String s0 = s.toLowerCase(); + final String s1 = fixAliases(s0); + final String s2 = nfd(s1); final String s3 = s2.replaceAll("–", " "); final String s4 = 
s3.replaceAll("&", " "); final String s5 = s4.replaceAll(""", " "); @@ -140,6 +146,18 @@ public abstract class AbstractPaceFunctions { return sb.toString().trim(); } + protected String filterAllStopWords(String s) { + + s = filterStopWords(s, stopwords_en); + s = filterStopWords(s, stopwords_de); + s = filterStopWords(s, stopwords_it); + s = filterStopWords(s, stopwords_fr); + s = filterStopWords(s, stopwords_pt); + s = filterStopWords(s, stopwords_es); + + return s; + } + protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) { final Set newset = Sets.newLinkedHashSet(); for (final String s : set) { @@ -192,15 +210,7 @@ public abstract class AbstractPaceFunctions { return sb.toString().trim(); } - public String normalizeCities(String s1, Map cityMap){ - //TODO change normalization mode - - for (String city : cityMap.keySet()) - s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " "); - return s1; - } - - public String normalizeCities2 (String s1, Map cityMap, int windowSize){ + public String keywordsToCode(String s1, Map translationMap, int windowSize){ List tokens = Arrays.asList(s1.split(" ")); @@ -213,9 +223,8 @@ public abstract class AbstractPaceFunctions { for (int i = 0; i<=tokens.size()-length; i++){ String candidate = Joiner.on(" ").join(tokens.subList(i, i + length)); - if (cityMap.containsKey(candidate)) { - s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + cityMap.get(candidate) + " "); - return s1; + if (translationMap.containsKey(candidate)) { + s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + translationMap.get(candidate) + " "); } } length-=1; @@ -229,9 +238,20 @@ public abstract class AbstractPaceFunctions { final String regexKey = "\\bkey::[0-9]*\\b"; final String regexCity = "\\bcity::[0-9]*\\b"; return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim(); - } + public double keywordsCompare(String s1, String s2){ + + List keywords1 = getKeywords(s1); + List keywords2 = 
getKeywords(s2); + int longer = (keywords1.size()>keywords2.size())?keywords1.size():keywords2.size(); + + if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty()) + return 1.0; + else + return (double)CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()/(double)longer; + } + //check if 2 strings have same keywords public boolean sameKeywords(String s1, String s2){ //at least 1 keyword in common diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java index ebaa0eab4..fea74af37 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java @@ -47,27 +47,25 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { cb = removeStopwords(cb); //replace keywords with codes - ca = translate(ca, translationMap); - cb = translate(cb, translationMap); + String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue()); + String codesB = keywordsToCode(cb, translationMap, params.getOrDefault("windowSize",4).intValue()); //replace cities with codes -// String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap); -// ca = norm.split("\\|\\|\\|")[0].trim(); -// cb = norm.split("\\|\\|\\|")[1].trim(); + codesA = keywordsToCode(codesA, cityMap, params.getOrDefault("windowSize", 4).intValue()); + codesB = keywordsToCode(codesB, cityMap, params.getOrDefault("windowSize", 4).intValue()); - ca = normalizeCities2(ca, cityMap, params.getOrDefault("windowSize", 4).intValue()); - cb = normalizeCities2(cb, cityMap, params.getOrDefault("windowSize", 4).intValue()); - - if (sameCity(ca,cb)){ - if (sameKeywords(ca,cb)){ - ca = removeCodes(ca); - cb = removeCodes(cb); - if (ca.isEmpty() && cb.isEmpty()) - return 1.0; - else - return 
normalize(ssalgo.score(ca,cb)); - } + //if two names have same city + if (sameCity(codesA,codesB)){ + if (keywordsCompare(codesA, codesB)>params.getOrDefault("threshold", 0.5).doubleValue()) { + ca = removeCodes(codesA); + cb = removeCodes(codesB); + if (ca.isEmpty() && cb.isEmpty()) + return 1.0; + else + return normalize(ssalgo.score(ca,cb)); + } } + return 0.0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 1cd0eb3af..01da9c227 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -9,8 +9,6 @@ import eu.dnetlib.pace.distance.eval.ScoreResult; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocumentComparator; -import eu.dnetlib.pace.model.TreeNodeDef; -import eu.dnetlib.pace.tree.support.MatchType; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -45,67 +43,13 @@ public class BlockProcessor { if (q.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); //process(q, context); + process(simplifyQueue(q, key, context), context); - //process the decision tree if it is specified, otherwise go with conditions and distance algos - if (!dedupConf.getPace().getDecisionTree().isEmpty()){ - processPersons(q, context); - } - else { - process(simplifyQueue(q, key, context), context); - } } else { context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); } } - private void processPersons(final Queue queue, final Reporter context) { - - while (!queue.isEmpty()) { - - final MapDocument pivot = queue.remove(); //take first element of the queue - final String idPivot = pivot.getIdentifier(); - - //compare the first element with all the others - for (final 
MapDocument curr : queue) { - final String idCurr = curr.getIdentifier(); - - //check if pivot and current element are similar by processing the tree - if (navigateTree(pivot, curr)!=MatchType.NO_MATCH) - writeSimilarity(context, idPivot, idCurr); - } - } - } - - public MatchType navigateTree(final MapDocument doc1, final MapDocument doc2){ - - final Map decisionTree = dedupConf.getPace().getDecisionTree(); - - String current = "start"; - - while (MatchType.getEnum(current)==MatchType.UNDEFINED) { - - TreeNodeDef currentNode = decisionTree.get(current); - //throw an exception if the node doesn't exist - if (currentNode == null) - throw new PaceException("The Tree Node doesn't exist: " + current); - - double similarity = currentNode.evaluate(doc1, doc2); - - if (similarity == -1) { - current = currentNode.getUndefined(); - } - else if (similarity>=currentNode.getThreshold()){ - current = currentNode.getPositive(); - } - else { - current = currentNode.getNegative(); - } - - } - - return MatchType.getEnum(current); - } - private Queue prepare(final Iterable documents) { final Queue queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField())); diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv index 720b82302..5f70a373b 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv @@ -8595,4 +8595,4 @@ city::890299;Harare;Arare;Charare;Gorad Kharareh;HRE;Harare;Hararensis Urbs;Hara city::890422;Gweru;GWE;Gveru;Gwelo;Gweru;Гверу; city::893697;Chinhoyi;Chinhoyi;Chinkhoi;Chinkhoji;Chinoyi;Cinhojis;Sinoia;qi nuo yi;Činhojis;Чинхойи;Чинхої;Чинхоји;چینہوئی;奇諾伊; city::894701;Bulawayo;BUQ;Bulavajas;Bulavajo;Bulavejo;Bulawayo;Gorad Bulavajo;bu la wa yue;bullawayo;burawayo;Булавайо;Булавајо;Булавейо;Горад Булаваё;ブラワヨ;布拉瓦约;불라와요; 
-city::1106542;Chitungwiza;Chitungviza;Chitungwiza;Citungviza;Gorad Chytungviza;chytwngwyza;Čitungviza;Горад Чытунгвіза;Читунгвиза;Читунгвіза;چیتونگویزا; +city::1106542;Chitungwiza;Chitungviza;Chitungwiza;Citungviza;Gorad Chytungviza;chytwngwyza;Čitungviza;Горад Чытунгвіза;Читунгвиза;Читунгвіза;چیتونگویزا; \ No newline at end of file diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt index 2003b42d2..2ce975b13 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt @@ -611,7 +611,6 @@ terzo th ti titolo -torino tra tranne tre diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv index cd1479688..55d114c79 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv @@ -1,11 +1,11 @@ -key::1;university;università;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο +key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο 
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο -key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα +key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas key::7;college;collegio;université;colegio;faculdade;Hochschule;Szkoła Wyższa;Высшая школа;universiteit;κολλέγιο -key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα +key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα;idryma key::9;center;centro;centre;centro;centro;zentrum;centrum;центр;centrum;κέντρο key::10;national;nazionale;national;nationale;nationaux;nationales;nacional;nacional;national;krajowy;национальный;nationaal;nationale;εθνικό key::11;association;associazione;association;asociación;associação;Verein;verband;stowarzyszenie;ассоциация;associatie @@ -44,4 +44,60 @@ key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρω key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί key::45;institution;istituzione;institution;институциональный;instelling;ινστιτούτο key::46;division;divisione;division;отделение;divisie;τμήμα -key::47;committee;comitato;comité;комитет;commissie;επιτροπή \ No newline at end of file +key::47;committee;comitato;comité;комитет;commissie;επιτροπή +key::48;promotion;promozione;продвижение;proothisis;forderung +key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline; 
+key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline; +key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik; +key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri; +key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus; +key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia; +key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik; +key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon; +key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne; +key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;; 
+key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri; +key::60;pharmaceuticals;pharmacy;farmacia;farmaceutica;farmacéutica;farmacia;farmacêutica;farmácia;Pharmazeutika;Arzneimittelkunde;farmaceutica;geneesmiddelen;apotheek;φαρμακευτικός;φαρμακευτική;φαρμακευτικό;φαρμακευτικά;φαρμακείο;ilaç;eczane;gyógyszerészeti;gyógyszertár;farmacevtika;lekarništvo;farmaatsia;farmatseutiline; +key::61;healthcare;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu; +key::62;history;storia;historia;história;Geschichte;geschiedenis;geschiedkunde;ιστορία;tarih;történelem;zgodovina;ajalugu; +key::63;materials;materiali;materia;materiales;materiais;materialen;υλικά;τεκμήρια;malzemeler;anyagok;materiali;materjalid;vahendid; +key::64;economics;economia;economiche;economica;economía;economia;Wirtschaft;economie;οικονομικά;οικονομικέςεπιστήμες;ekonomi;közgazdaságtan;gospodarstvo;ekonomija;majanduslik;majandus; +key::65;therapeutics;terapeutica;terapéutica;terapêutica;therapie;θεραπευτική;tedavibilimi;gyógykezelés;terapevtika;terapeutiline;ravi; +key::66;oncology;oncologia;oncologico;oncología;oncologia;Onkologie;oncologie;ογκολογία;onkoloji;onkológia;onkologija;onkoloogia; +key::67;natural;naturali;naturale;natural;natural;natürlich;natuurlijk;φυσικός;φυσική;φυσικό;φυσικά;doğal;természetes;naraven;loodus; +key::68;educational;educazione;pedagogia;educacional;educativo;educacional;pädagogisch;educatief;εκπαιδευτικός;εκπαιδευτική;εκπαιδευτικό;εκπαιδευτικά;eğitimsel;oktatási;izobraževalen;haridus;hariduslik; +key::69;biomedical;biomedica;biomédico;biomédico;biomedizinisch;biomedisch;βιοιατρικός;βιοιατρική;βιοιατρικό;βιοιατρικά;biyomedikal;orvosbiológiai;biomedicinski;biomeditsiiniline; 
+key::70;veterinary;veterinaria;veterinarie;veterinaria;veterinária;tierärtzlich;veterinair;veeartsenijlkunde;κτηνιατρικός;κτηνιατρική;κτηνιατρικό;κτηνιατρικά;veteriner;állatorvosi;veterinar;veterinarski;veterinaaria; +key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia; +key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek; +key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia; +key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar; +key::75;theological;teologia;teologico;teológico;tecnológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;technológiai;teološki;teoloogia;usuteadus;teoloogiline; +key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika; +key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus; +key::78;maritime;marittima;marittime;marittimo;marítimo;marítimo;maritiem;ναυτικός;ναυτική;ναυτικό;ναυτικά;ναυτιλιακός;ναυτιλιακή;ναυτιλιακό;ναυτιλιακά;θαλάσσιος;θαλάσσια;θαλάσσιο;denizcilik;tengeri;morski;mere;merendus; +key::79;sports;sport;deportes;esportes;Sport;sport;sportwetenschappen;άθληση;γυμναστικήδραστηριότητα;spor;sport;šport;sport;spordi; +key::80;surgery;chirurgia;chirurgiche;cirugía;cirurgia;Chirurgie;chirurgie;heelkunde;εγχείρηση;επέμβαση;χειρουργικήεπέμβαση;cerrahi;sebészet;kirurgija;kirurgia; 
+key::81;cultural;culturale;culturali;cultura;cultural;cultural;kulturell;cultureel;πολιτιστικός;πολιτιστική;πολιτιστικό;πολιτισμικός;πολιτισμική;πολιτισμικό;kültürel;kultúrális;kulturni;kultuuri;kultuuriline; +key::82;computerscience;informatica;ordenador;computadora;informática;computación;cienciasdelacomputación;ciênciadacomputação;Computer;computer;υπολογιστής;ηλεκτρονικόςυπολογιστής;bilgisayar;számítógép;računalnik;arvuti; +key::83;finance;financial;finanza;finanziarie;finanza;financiero;finanças;financeiro;Finanzen;finanziell;financiën;financieel;χρηματοοικονομικά;χρηματοδότηση;finanse;finansal;pénzügy;pénzügyi;finance;finančni;finants;finantsiline; +key::84;communication;comunicazione;comuniciación;comunicação;Kommunikation;communication;επικοινωνία;iletişim;kommunikáció;komuniciranje;kommunikatsioon; +key::85;justice;giustizia;justicia;justiça;Recht;Justiz;justitie;gerechtigheid;δικαιοσύνη;υπουργείοδικαιοσύνης;δίκαιο;adalet;igazságügy;pravo;õigus; +key::86;aerospace;aerospaziale;aerospaziali;aeroespacio;aeroespaço;Luftfahrt;luchtvaart;ruimtevaart;αεροπορικός;αεροπορική;αεροπορικό;αεροναυπηγικός;αεροναυπηγική;αεροναυπηγικό;αεροναυπηγικά;havacılıkveuzay;légtér;zrakoplovstvo;atmosfäär;kosmos; +key::87;dermatology;dermatologia;dermatología;dermatologia;Dermatologie;dermatologie;δρματολογία;dermatoloji;bőrgyógyászat;dermatológia;dermatologija;dermatoloogia; +key::88;architecture;architettura;arquitectura;arquitetura;Architektur;architectuur;αρχιτεκτονική;mimarlık;építészet;arhitektura;arhitektuur; +key::89;mathematics;matematica;matematiche;matemáticas;matemáticas;Mathematik;wiskunde;mathematica;μαθηματικά;matematik;matematika;matematika;matemaatika; +key::90;language;lingue;linguistica;linguistiche;lenguaje;idioma;língua;idioma;Sprache;taal;taalkunde;γλώσσα;dil;nyelv;jezik;keel; +key::91;neuroscience;neuroscienza;neurociencia;neurociência;Neurowissenschaft;neurowetenschappen;νευροεπιστήμη;nörobilim;idegtudomány;nevroznanost;neuroteadused; 
+key::92;automation;automazione;automatización;automação;Automatisierung;automatisering;αυτοματοποίηση;otomasyon;automatizálás;avtomatizacija;automatiseeritud; +key::93;pediatric;pediatria;pediatriche;pediatrico;pediátrico;pediatría;pediátrico;pediatria;pädiatrisch;pediatrische;παιδιατρική;pediatrik;gyermekgyógyászat;pediatrija;pediaatria; +key::94;photonics;fotonica;fotoniche;fotónica;fotônica;Photonik;fotonica;φωτονική;fotonik;fotonika;fotonika;fotoonika; +key::95;mechanics;meccanica;meccaniche;mecánica;mecânica;Mechanik;Maschinenbau;mechanica;werktuigkunde;μηχανικής;mekanik;gépészet;mehanika;mehaanika; +key::96;psychiatrics;psichiatria;psichiatrica;psichiatriche;psiquiatría;psiquiatria;Psychiatrie;psychiatrie;ψυχιατρική;psikiyatrik;pszihiátria;psihiatrija;psühhaatria; +key::97;psychology;fisiologia;psicología;psicologia;Psychologie;psychologie;ψυχολογία;psikoloji;pszihológia;psihologija;psühholoogia; +key::98;automotive;industriaautomobilistica;industriadelautomóvil;automotriz;industriaautomotriz;automotivo;Automobilindustrie;autoindustrie;αυτοκίνητος;αυτοκίνητη;αυτοκίνητο;αυτοκινούμενος;αυτοκινούμενη;αυτοκινούμενο;αυτοκινητιστικός;αυτοκινητιστική;αυτοκινητιστικό;otomotiv;autóipari;samogiben;avtomobilskaindustrija;auto-; +key::99;neurology;neurologia;neurologiche;neurología;neurologia;Neurologie;neurologie;zenuwleer;νευρολογία;nöroloji;neurológia;ideggyógyászat;nevrologija;neuroloogia; +key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia; +key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia; +key::102;informatics;informatica;informática;informática;informatica; +key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association diff --git a/dnet-pace-core/src/test/java/DedupTestIT.java
b/dnet-pace-core/src/test/java/DedupTestIT.java new file mode 100644 index 000000000..f9f1ed328 --- /dev/null +++ b/dnet-pace-core/src/test/java/DedupTestIT.java @@ -0,0 +1,4 @@ +public class DedupTestIT { + + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 883dde57c..c92c6fed3 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -10,6 +10,7 @@ import java.util.HashMap; import java.util.Map; import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; public class DistanceAlgoTest extends AbstractPaceFunctions { @@ -61,5 +62,45 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { assertEquals(result, 1.0); } + @Test + public void testJaroWinklerNormalizedName3() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna"); + + System.out.println("result = " + result); + assertEquals(result, 0.0); + } + + @Test + public void testJaroWinklerNormalizedName4() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa"); + + System.out.println("result = " + result); + assertEquals(result, 1.0); + } + + @Test + public void testJaroWinklerNormalizedName5() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS"); + + System.out.println("result = " + result); + assertEquals(result, 1.0); + } + + @Test + 
public void testJaroWinklerNormalizedName6() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung"); + + System.out.println("result = " + result); + assertTrue(result> 0.9); + + } } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/tree/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/tree/ComparatorTest.java deleted file mode 100644 index 240a5d6bd..000000000 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/tree/ComparatorTest.java +++ /dev/null @@ -1,144 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.AbstractPaceTest; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; -import org.junit.Before; -import org.junit.Test; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - -import static junit.framework.Assert.assertEquals; -import static junit.framework.Assert.assertTrue; - -//test class for comparators (to be used into the tree nodes) -public class ComparatorTest extends AbstractPaceTest { - - private Map params; - - @Before - public void setup() { - params = new HashMap<>(); - //to put all the needed parameters - params.put("minCoauthors", 5); - params.put("maxCoauthors", 200); - - } - - @Test - public void testCoauthorsMatch() { - - final CoauthorsMatch coauthorsMatch = new CoauthorsMatch(params); - - Field a = createFieldList(Arrays.asList("la bruzzo, sandro", "atzori, claudio", "artini, michele", "de bonis, michele", "bardi, alessia", "dell'amico, andrea", "baglioni, miriam"), "coauthors"); - Field b = createFieldList(Arrays.asList("la bruzzo, sandro"), "coauthors"); - - double result1 = coauthorsMatch.compare(a, b); - double result2 = 
coauthorsMatch.compare(a, a); - - System.out.println("a = " + a); - System.out.println("b = " + b); - - System.out.println("a vs b = " + result1); - System.out.println("a vs a = " + result2); - - assertEquals(result1, -1.0); - assertEquals(result2, 7.0); - } - - @Test - public void testExactMatch() { - - final ExactMatch exactMatch = new ExactMatch(params); - - Field a = new FieldValueImpl(Type.String, "doi", "10.1000/0000000000"); - Field b = new FieldValueImpl(Type.String, "doi", "10.1033/0000000000"); - Field c = new FieldValueImpl(Type.String, "doi", ""); - - double result1 = exactMatch.compare(a,a); - double result2 = exactMatch.compare(a,b); - double result3 = exactMatch.compare(a,c); - - System.out.println("a = " + a); - System.out.println("b = " + b); - System.out.println("c = " + c); - - System.out.println("a vs a = " + result1); - System.out.println("a vs b = " + result2); - System.out.println("a vs c = " + result3); - - assertEquals(result1, 1.0); - assertEquals(result2, 0.0); - assertEquals(result3, -1.0); - - } - - @Test - public void testSimilarMatch() { - - final SimilarMatch similarMatch = new SimilarMatch(params); - - Field a = new FieldValueImpl(Type.String, "firstname", "sandro"); - Field b = new FieldValueImpl(Type.String, "firstname", "s."); - Field c = new FieldValueImpl(Type.String, "firstname", "stefano"); - - double result1 = similarMatch.compare(a,b); - double result2 = similarMatch.compare(a,c); - double result3 = similarMatch.compare(b,c); - - System.out.println("a = " + a); - System.out.println("b = " + b); - System.out.println("c = " + c); - - System.out.println("a vs b = " + result1); - System.out.println("a vs c = " + result2); - System.out.println("b vs c = " + result3); - - assertEquals(result1, 1.0); - assertEquals(result3, 1.0); - assertTrue(result2<0.7); - - } - - @Test - public void testTopicsMatch() { - - final TopicsMatch topicsMatch = new TopicsMatch(params); - - Field a = createFieldList(Arrays.asList("0.0", "1.0", "0.0"), 
"topics"); - Field b = createFieldList(Arrays.asList("0.0", "0.0", "1.0"), "topics"); - Field c = createFieldList(Arrays.asList("0.5", "0.5", "0.0"), "topics"); - - double result1 = topicsMatch.compare(a,a); - double result2 = topicsMatch.compare(a,c); - double result3 = topicsMatch.compare(b,c); - - System.out.println("a = " + a); - System.out.println("b = " + b); - System.out.println("c = " + c); - - System.out.println("a vs a = " + result1); - System.out.println("a vs c = " + result2); - System.out.println("b vs c = " + result3); - - assertEquals(result1, 1.0); - assertEquals(result2, 0.5); - assertEquals(result3, 0.0); - - } - - @Test - public void testUndefinedNode() { - - final UndefinedNode undefinedNode = new UndefinedNode(); - double result = undefinedNode.compare(new FieldListImpl(),new FieldListImpl()); - - assertEquals(result, 0.0); - } - - -}