forked from D-Net/dnet-hadoop
update of the comparator for legalnames of organizations
This commit is contained in:
parent
0735f3a822
commit
662448e584
|
@ -1,16 +1,15 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.function.Function;
|
|
||||||
import java.util.function.Predicate;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
|
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
|
||||||
|
|
||||||
protected Map<String, Integer> params;
|
protected Map<String, Integer> params;
|
||||||
|
@ -26,7 +25,7 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
||||||
return fields.stream().filter(f -> !f.isEmpty())
|
return fields.stream().filter(f -> !f.isEmpty())
|
||||||
.map(Field::stringValue)
|
.map(Field::stringValue)
|
||||||
.map(this::normalize)
|
.map(this::normalize)
|
||||||
.map(s -> filterStopWords(s, stopwords))
|
.map(s -> filterAllStopWords(s))
|
||||||
.map(this::doApply)
|
.map(this::doApply)
|
||||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
.flatMap(c -> c.stream())
|
.flatMap(c -> c.stream())
|
||||||
|
|
|
@ -27,7 +27,12 @@ import java.util.regex.Pattern;
|
||||||
*/
|
*/
|
||||||
public abstract class AbstractPaceFunctions {
|
public abstract class AbstractPaceFunctions {
|
||||||
|
|
||||||
protected static Set<String> stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||||
|
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
||||||
|
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
||||||
|
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
|
||||||
|
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||||
|
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||||
|
|
||||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||||
|
|
||||||
|
@ -42,8 +47,9 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String cleanup(final String s) {
|
protected String cleanup(final String s) {
|
||||||
final String s1 = nfd(s);
|
final String s0 = s.toLowerCase();
|
||||||
final String s2 = fixAliases(s1);
|
final String s1 = fixAliases(s0);
|
||||||
|
final String s2 = nfd(s1);
|
||||||
final String s3 = s2.replaceAll("–", " ");
|
final String s3 = s2.replaceAll("–", " ");
|
||||||
final String s4 = s3.replaceAll("&", " ");
|
final String s4 = s3.replaceAll("&", " ");
|
||||||
final String s5 = s4.replaceAll(""", " ");
|
final String s5 = s4.replaceAll(""", " ");
|
||||||
|
@ -140,6 +146,18 @@ public abstract class AbstractPaceFunctions {
|
||||||
return sb.toString().trim();
|
return sb.toString().trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected String filterAllStopWords(String s) {
|
||||||
|
|
||||||
|
s = filterStopWords(s, stopwords_en);
|
||||||
|
s = filterStopWords(s, stopwords_de);
|
||||||
|
s = filterStopWords(s, stopwords_it);
|
||||||
|
s = filterStopWords(s, stopwords_fr);
|
||||||
|
s = filterStopWords(s, stopwords_pt);
|
||||||
|
s = filterStopWords(s, stopwords_es);
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
|
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
|
||||||
final Set<String> newset = Sets.newLinkedHashSet();
|
final Set<String> newset = Sets.newLinkedHashSet();
|
||||||
for (final String s : set) {
|
for (final String s : set) {
|
||||||
|
@ -192,15 +210,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return sb.toString().trim();
|
return sb.toString().trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String normalizeCities(String s1, Map<String,String> cityMap){
|
public String keywordsToCode(String s1, Map<String, String> translationMap, int windowSize){
|
||||||
//TODO change normalization mode
|
|
||||||
|
|
||||||
for (String city : cityMap.keySet())
|
|
||||||
s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " ");
|
|
||||||
return s1;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String normalizeCities2 (String s1, Map<String, String> cityMap, int windowSize){
|
|
||||||
|
|
||||||
List<String> tokens = Arrays.asList(s1.split(" "));
|
List<String> tokens = Arrays.asList(s1.split(" "));
|
||||||
|
|
||||||
|
@ -213,9 +223,8 @@ public abstract class AbstractPaceFunctions {
|
||||||
|
|
||||||
for (int i = 0; i<=tokens.size()-length; i++){
|
for (int i = 0; i<=tokens.size()-length; i++){
|
||||||
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
|
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
|
||||||
if (cityMap.containsKey(candidate)) {
|
if (translationMap.containsKey(candidate)) {
|
||||||
s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + cityMap.get(candidate) + " ");
|
s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + translationMap.get(candidate) + " ");
|
||||||
return s1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
length-=1;
|
length-=1;
|
||||||
|
@ -229,9 +238,20 @@ public abstract class AbstractPaceFunctions {
|
||||||
final String regexKey = "\\bkey::[0-9]*\\b";
|
final String regexKey = "\\bkey::[0-9]*\\b";
|
||||||
final String regexCity = "\\bcity::[0-9]*\\b";
|
final String regexCity = "\\bcity::[0-9]*\\b";
|
||||||
return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim();
|
return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public double keywordsCompare(String s1, String s2){
|
||||||
|
|
||||||
|
List<String> keywords1 = getKeywords(s1);
|
||||||
|
List<String> keywords2 = getKeywords(s2);
|
||||||
|
int longer = (keywords1.size()>keywords2.size())?keywords1.size():keywords2.size();
|
||||||
|
|
||||||
|
if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty())
|
||||||
|
return 1.0;
|
||||||
|
else
|
||||||
|
return (double)CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()/(double)longer;
|
||||||
|
}
|
||||||
|
|
||||||
//check if 2 strings have same keywords
|
//check if 2 strings have same keywords
|
||||||
public boolean sameKeywords(String s1, String s2){
|
public boolean sameKeywords(String s1, String s2){
|
||||||
//at least 1 keyword in common
|
//at least 1 keyword in common
|
||||||
|
|
|
@ -47,27 +47,25 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||||
cb = removeStopwords(cb);
|
cb = removeStopwords(cb);
|
||||||
|
|
||||||
//replace keywords with codes
|
//replace keywords with codes
|
||||||
ca = translate(ca, translationMap);
|
String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue());
|
||||||
cb = translate(cb, translationMap);
|
String codesB = keywordsToCode(cb, translationMap, params.getOrDefault("windowSize",4).intValue());
|
||||||
|
|
||||||
//replace cities with codes
|
//replace cities with codes
|
||||||
// String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
|
codesA = keywordsToCode(codesA, cityMap, params.getOrDefault("windowSize", 4).intValue());
|
||||||
// ca = norm.split("\\|\\|\\|")[0].trim();
|
codesB = keywordsToCode(codesB, cityMap, params.getOrDefault("windowSize", 4).intValue());
|
||||||
// cb = norm.split("\\|\\|\\|")[1].trim();
|
|
||||||
|
|
||||||
ca = normalizeCities2(ca, cityMap, params.getOrDefault("windowSize", 4).intValue());
|
//if two names have same city
|
||||||
cb = normalizeCities2(cb, cityMap, params.getOrDefault("windowSize", 4).intValue());
|
if (sameCity(codesA,codesB)){
|
||||||
|
if (keywordsCompare(codesA, codesB)>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||||
if (sameCity(ca,cb)){
|
ca = removeCodes(codesA);
|
||||||
if (sameKeywords(ca,cb)){
|
cb = removeCodes(codesB);
|
||||||
ca = removeCodes(ca);
|
if (ca.isEmpty() && cb.isEmpty())
|
||||||
cb = removeCodes(cb);
|
return 1.0;
|
||||||
if (ca.isEmpty() && cb.isEmpty())
|
else
|
||||||
return 1.0;
|
return normalize(ssalgo.score(ca,cb));
|
||||||
else
|
}
|
||||||
return normalize(ssalgo.score(ca,cb));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,8 +9,6 @@ import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.model.MapDocumentComparator;
|
import eu.dnetlib.pace.model.MapDocumentComparator;
|
||||||
import eu.dnetlib.pace.model.TreeNodeDef;
|
|
||||||
import eu.dnetlib.pace.tree.support.MatchType;
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
@ -45,67 +43,13 @@ public class BlockProcessor {
|
||||||
if (q.size() > 1) {
|
if (q.size() > 1) {
|
||||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||||
//process(q, context);
|
//process(q, context);
|
||||||
|
process(simplifyQueue(q, key, context), context);
|
||||||
|
|
||||||
//process the decision tree if it is specified, otherwise go with conditions and distance algos
|
|
||||||
if (!dedupConf.getPace().getDecisionTree().isEmpty()){
|
|
||||||
processPersons(q, context);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
process(simplifyQueue(q, key, context), context);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void processPersons(final Queue<MapDocument> queue, final Reporter context) {
|
|
||||||
|
|
||||||
while (!queue.isEmpty()) {
|
|
||||||
|
|
||||||
final MapDocument pivot = queue.remove(); //take first element of the queue
|
|
||||||
final String idPivot = pivot.getIdentifier();
|
|
||||||
|
|
||||||
//compare the first element with all the others
|
|
||||||
for (final MapDocument curr : queue) {
|
|
||||||
final String idCurr = curr.getIdentifier();
|
|
||||||
|
|
||||||
//check if pivot and current element are similar by processing the tree
|
|
||||||
if (navigateTree(pivot, curr)!=MatchType.NO_MATCH)
|
|
||||||
writeSimilarity(context, idPivot, idCurr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public MatchType navigateTree(final MapDocument doc1, final MapDocument doc2){
|
|
||||||
|
|
||||||
final Map<String, TreeNodeDef> decisionTree = dedupConf.getPace().getDecisionTree();
|
|
||||||
|
|
||||||
String current = "start";
|
|
||||||
|
|
||||||
while (MatchType.getEnum(current)==MatchType.UNDEFINED) {
|
|
||||||
|
|
||||||
TreeNodeDef currentNode = decisionTree.get(current);
|
|
||||||
//throw an exception if the node doesn't exist
|
|
||||||
if (currentNode == null)
|
|
||||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
|
||||||
|
|
||||||
double similarity = currentNode.evaluate(doc1, doc2);
|
|
||||||
|
|
||||||
if (similarity == -1) {
|
|
||||||
current = currentNode.getUndefined();
|
|
||||||
}
|
|
||||||
else if (similarity>=currentNode.getThreshold()){
|
|
||||||
current = currentNode.getPositive();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
current = currentNode.getNegative();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return MatchType.getEnum(current);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
|
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
|
||||||
final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField()));
|
final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField()));
|
||||||
|
|
||||||
|
|
|
@ -611,7 +611,6 @@ terzo
|
||||||
th
|
th
|
||||||
ti
|
ti
|
||||||
titolo
|
titolo
|
||||||
torino
|
|
||||||
tra
|
tra
|
||||||
tranne
|
tranne
|
||||||
tre
|
tre
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
key::1;university;università;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο
|
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο
|
||||||
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
|
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
|
||||||
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα
|
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα
|
||||||
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
|
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
|
||||||
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
|
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
|
||||||
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα
|
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
|
||||||
key::7;college;collegio;université;colegio;faculdade;Hochschule;Szkoła Wyższa;Высшая школа;universiteit;κολλέγιο
|
key::7;college;collegio;université;colegio;faculdade;Hochschule;Szkoła Wyższa;Высшая школа;universiteit;κολλέγιο
|
||||||
key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα
|
key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα;idryma
|
||||||
key::9;center;centro;centre;centro;centro;zentrum;centrum;центр;centrum;κέντρο
|
key::9;center;centro;centre;centro;centro;zentrum;centrum;центр;centrum;κέντρο
|
||||||
key::10;national;nazionale;national;nationale;nationaux;nationales;nacional;nacional;national;krajowy;национальный;nationaal;nationale;εθνικό
|
key::10;national;nazionale;national;nationale;nationaux;nationales;nacional;nacional;national;krajowy;национальный;nationaal;nationale;εθνικό
|
||||||
key::11;association;associazione;association;asociación;associação;Verein;verband;stowarzyszenie;ассоциация;associatie
|
key::11;association;associazione;association;asociación;associação;Verein;verband;stowarzyszenie;ассоциация;associatie
|
||||||
|
@ -45,3 +45,59 @@ key::44;academic;accademico;académique;universitaire;акадеческий aca
|
||||||
key::45;institution;istituzione;institution;институциональный;instelling;ινστιτούτο
|
key::45;institution;istituzione;institution;институциональный;instelling;ινστιτούτο
|
||||||
key::46;division;divisione;division;отделение;divisie;τμήμα
|
key::46;division;divisione;division;отделение;divisie;τμήμα
|
||||||
key::47;committee;comitato;comité;комитет;commissie;επιτροπή
|
key::47;committee;comitato;comité;комитет;commissie;επιτροπή
|
||||||
|
key::48;promotion;promozione;продвижение;proothisis;forderung
|
||||||
|
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
|
||||||
|
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;
|
||||||
|
key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
|
||||||
|
key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
|
||||||
|
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
|
||||||
|
key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia;
|
||||||
|
key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik;
|
||||||
|
key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon;
|
||||||
|
key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne;
|
||||||
|
key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;
|
||||||
|
key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri;
|
||||||
|
key::60;pharmaceuticals;pharmacy;farmacia;farmaceutica;farmacéutica;farmacia;farmacêutica;farmácia;Pharmazeutika;Arzneimittelkunde;farmaceutica;geneesmiddelen;apotheek;φαρμακευτικός;φαρμακευτική;φαρμακευτικό;φαρμακευτικά;φαρμακείο;ilaç;eczane;gyógyszerészeti;gyógyszertár;farmacevtika;lekarništvo;farmaatsia;farmatseutiline;
|
||||||
|
key::61;healthcare;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu;
|
||||||
|
key::62;history;storia;historia;história;Geschichte;geschiedenis;geschiedkunde;ιστορία;tarih;történelem;zgodovina;ajalugu;
|
||||||
|
key::63;materials;materiali;materia;materiales;materiais;materialen;υλικά;τεκμήρια;malzemeler;anyagok;materiali;materjalid;vahendid;
|
||||||
|
key::64;economics;economia;economiche;economica;economía;economia;Wirtschaft;economie;οικονομικά;οικονομικέςεπιστήμες;ekonomi;közgazdaságtan;gospodarstvo;ekonomija;majanduslik;majandus;
|
||||||
|
key::65;therapeutics;terapeutica;terapéutica;terapêutica;therapie;θεραπευτική;tedavibilimi;gyógykezelés;terapevtika;terapeutiline;ravi;
|
||||||
|
key::66;oncology;oncologia;oncologico;oncología;oncologia;Onkologie;oncologie;ογκολογία;onkoloji;onkológia;onkologija;onkoloogia;
|
||||||
|
key::67;natural;naturali;naturale;natural;natural;natürlich;natuurlijk;φυσικός;φυσική;φυσικό;φυσικά;doğal;természetes;naraven;loodus;
|
||||||
|
key::68;educational;educazione;pedagogia;educacional;educativo;educacional;pädagogisch;educatief;εκπαιδευτικός;εκπαιδευτική;εκπαιδευτικό;εκπαιδευτικά;eğitimsel;oktatási;izobraževalen;haridus;hariduslik;
|
||||||
|
key::69;biomedical;biomedica;biomédico;biomédico;biomedizinisch;biomedisch;βιοιατρικός;βιοιατρική;βιοιατρικό;βιοιατρικά;biyomedikal;orvosbiológiai;biomedicinski;biomeditsiiniline;
|
||||||
|
key::70;veterinary;veterinaria;veterinarie;veterinaria;veterinária;tierärtzlich;veterinair;veeartsenijlkunde;κτηνιατρικός;κτηνιατρική;κτηνιατρικό;κτηνιατρικά;veteriner;állatorvosi;veterinar;veterinarski;veterinaaria;
|
||||||
|
key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia;
|
||||||
|
key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek;
|
||||||
|
key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia;
|
||||||
|
key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar;
|
||||||
|
key::75;theological;teologia;teologico;teológico;teológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;teológiai;teološki;teoloogia;usuteadus;teoloogiline;
|
||||||
|
key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika;
|
||||||
|
key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus;
|
||||||
|
key::78;maritime;marittima;marittime;marittimo;marítimo;marítimo;maritiem;ναυτικός;ναυτική;ναυτικό;ναυτικά;ναυτιλιακός;ναυτιλιακή;ναυτιλιακό;ναυτιλιακά;θαλάσσιος;θαλάσσια;θαλάσσιο;denizcilik;tengeri;morski;mere;merendus;
|
||||||
|
key::79;sports;sport;deportes;esportes;Sport;sport;sportwetenschappen;άθληση;γυμναστικήδραστηριότητα;spor;sport;šport;sport;spordi;
|
||||||
|
key::80;surgery;chirurgia;chirurgiche;cirugía;cirurgia;Chirurgie;chirurgie;heelkunde;εγχείρηση;επέμβαση;χειρουργικήεπέμβαση;cerrahi;sebészet;kirurgija;kirurgia;
|
||||||
|
key::81;cultural;culturale;culturali;cultura;cultural;cultural;kulturell;cultureel;πολιτιστικός;πολιτιστική;πολιτιστικό;πολιτισμικός;πολιτισμική;πολιτισμικό;kültürel;kultúrális;kulturni;kultuuri;kultuuriline;
|
||||||
|
key::82;computerscience;informatica;ordenador;computadora;informática;computación;cienciasdelacomputación;ciênciadacomputação;Computer;computer;υπολογιστής;ηλεκτρονικόςυπολογιστής;bilgisayar;számítógép;računalnik;arvuti;
|
||||||
|
key::83;finance;financial;finanza;finanziarie;finanza;financiero;finanças;financeiro;Finanzen;finanziell;financiën;financieel;χρηματοοικονομικά;χρηματοδότηση;finanse;finansal;pénzügy;pénzügyi;finance;finančni;finants;finantsiline;
|
||||||
|
key::84;communication;comunicazione;comunicación;comunicação;Kommunikation;communication;επικοινωνία;iletişim;kommunikáció;komuniciranje;kommunikatsioon;
|
||||||
|
key::85;justice;giustizia;justicia;justiça;Recht;Justiz;justitie;gerechtigheid;δικαιοσύνη;υπουργείοδικαιοσύνης;δίκαιο;adalet;igazságügy;pravo;õigus;
|
||||||
|
key::86;aerospace;aerospaziale;aerospaziali;aeroespacio;aeroespaço;Luftfahrt;luchtvaart;ruimtevaart;αεροπορικός;αεροπορική;αεροπορικό;αεροναυπηγικός;αεροναυπηγική;αεροναυπηγικό;αεροναυπηγικά;havacılıkveuzay;légtér;zrakoplovstvo;atmosfäär;kosmos;
|
||||||
|
key::87;dermatology;dermatologia;dermatología;dermatologia;Dermatologie;dermatologie;δερματολογία;dermatoloji;bőrgyógyászat;dermatológia;dermatologija;dermatoloogia;
|
||||||
|
key::88;architecture;architettura;arquitectura;arquitetura;Architektur;architectuur;αρχιτεκτονική;mimarlık;építészet;arhitektura;arhitektuur;
|
||||||
|
key::89;mathematics;matematica;matematiche;matemáticas;matemáticas;Mathematik;wiskunde;mathematica;μαθηματικά;matematik;matematika;matematika;matemaatika;
|
||||||
|
key::90;language;lingue;linguistica;linguistiche;lenguaje;idioma;língua;idioma;Sprache;taal;taalkunde;γλώσσα;dil;nyelv;jezik;keel;
|
||||||
|
key::91;neuroscience;neuroscienza;neurociencia;neurociência;Neurowissenschaft;neurowetenschappen;νευροεπιστήμη;nörobilim;idegtudomány;nevroznanost;neuroteadused;
|
||||||
|
key::92;automation;automazione;automatización;automação;Automatisierung;automatisering;αυτοματοποίηση;otomasyon;automatizálás;avtomatizacija;automatiseeritud;
|
||||||
|
key::93;pediatric;pediatria;pediatriche;pediatrico;pediátrico;pediatría;pediátrico;pediatria;pädiatrisch;pediatrische;παιδιατρική;pediatrik;gyermekgyógyászat;pediatrija;pediaatria;
|
||||||
|
key::94;photonics;fotonica;fotoniche;fotónica;fotônica;Photonik;fotonica;φωτονική;fotonik;fotonika;fotonika;fotoonika;
|
||||||
|
key::95;mechanics;meccanica;meccaniche;mecánica;mecânica;Mechanik;Maschinenbau;mechanica;werktuigkunde;μηχανικής;mekanik;gépészet;mehanika;mehaanika;
|
||||||
|
key::96;psychiatrics;psichiatria;psichiatrica;psichiatriche;psiquiatría;psiquiatria;Psychiatrie;psychiatrie;ψυχιατρική;psikiyatrik;pszihiátria;psihiatrija;psühhaatria;
|
||||||
|
key::97;psychology;psicologia;psicología;psicologia;Psychologie;psychologie;ψυχολογία;psikoloji;pszihológia;psihologija;psühholoogia;
|
||||||
|
key::98;automotive;industriaautomobilistica;industriadelautomóvil;automotriz;industriaautomotriz;automotivo;Automobilindustrie;autoindustrie;αυτοκίνητος;αυτοκίνητη;αυτοκίνητο;αυτοκινούμενος;αυτοκινούμενη;αυτοκινούμενο;αυτοκινητιστικός;αυτοκινητιστική;αυτοκινητιστικό;otomotiv;autóipari;samogiben;avtomobilskaindustrija;auto-;
|
||||||
|
key::99;neurology;neurologia;neurologiche;neurología;neurologia;Neurologie;neurologie;zenuwleer;νευρολογία;nöroloji;neurológia;ideggyógyászat;nevrologija;neuroloogia;
|
||||||
|
key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia;
|
||||||
|
key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia;
|
||||||
|
key::102;informatics;informatica;informática;informática;informatica;
|
||||||
|
key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association
|
||||||
|
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
public class DedupTestIT {
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -10,6 +10,7 @@ import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import static junit.framework.Assert.assertEquals;
|
import static junit.framework.Assert.assertEquals;
|
||||||
|
import static junit.framework.Assert.assertTrue;
|
||||||
|
|
||||||
public class DistanceAlgoTest extends AbstractPaceFunctions {
|
public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
|
@ -61,5 +62,45 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
assertEquals(result, 1.0);
|
assertEquals(result, 1.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testJaroWinklerNormalizedName3() {
|
||||||
|
|
||||||
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna");
|
||||||
|
|
||||||
|
System.out.println("result = " + result);
|
||||||
|
assertEquals(result, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testJaroWinklerNormalizedName4() {
|
||||||
|
|
||||||
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa");
|
||||||
|
|
||||||
|
System.out.println("result = " + result);
|
||||||
|
assertEquals(result, 1.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testJaroWinklerNormalizedName5() {
|
||||||
|
|
||||||
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS");
|
||||||
|
|
||||||
|
System.out.println("result = " + result);
|
||||||
|
assertEquals(result, 1.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testJaroWinklerNormalizedName6() {
|
||||||
|
|
||||||
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung");
|
||||||
|
|
||||||
|
System.out.println("result = " + result);
|
||||||
|
assertTrue(result> 0.9);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,144 +0,0 @@
|
||||||
package eu.dnetlib.pace.tree;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.AbstractPaceTest;
|
|
||||||
import eu.dnetlib.pace.config.Type;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldListImpl;
|
|
||||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
|
||||||
import org.junit.Before;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import static junit.framework.Assert.assertEquals;
|
|
||||||
import static junit.framework.Assert.assertTrue;
|
|
||||||
|
|
||||||
//test class for comparators (to be used into the tree nodes)
|
|
||||||
public class ComparatorTest extends AbstractPaceTest {
|
|
||||||
|
|
||||||
private Map<String, Number> params;
|
|
||||||
|
|
||||||
@Before
|
|
||||||
public void setup() {
|
|
||||||
params = new HashMap<>();
|
|
||||||
//to put all the needed parameters
|
|
||||||
params.put("minCoauthors", 5);
|
|
||||||
params.put("maxCoauthors", 200);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testCoauthorsMatch() {
|
|
||||||
|
|
||||||
final CoauthorsMatch coauthorsMatch = new CoauthorsMatch(params);
|
|
||||||
|
|
||||||
Field a = createFieldList(Arrays.asList("la bruzzo, sandro", "atzori, claudio", "artini, michele", "de bonis, michele", "bardi, alessia", "dell'amico, andrea", "baglioni, miriam"), "coauthors");
|
|
||||||
Field b = createFieldList(Arrays.asList("la bruzzo, sandro"), "coauthors");
|
|
||||||
|
|
||||||
double result1 = coauthorsMatch.compare(a, b);
|
|
||||||
double result2 = coauthorsMatch.compare(a, a);
|
|
||||||
|
|
||||||
System.out.println("a = " + a);
|
|
||||||
System.out.println("b = " + b);
|
|
||||||
|
|
||||||
System.out.println("a vs b = " + result1);
|
|
||||||
System.out.println("a vs a = " + result2);
|
|
||||||
|
|
||||||
assertEquals(result1, -1.0);
|
|
||||||
assertEquals(result2, 7.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testExactMatch() {
|
|
||||||
|
|
||||||
final ExactMatch exactMatch = new ExactMatch(params);
|
|
||||||
|
|
||||||
Field a = new FieldValueImpl(Type.String, "doi", "10.1000/0000000000");
|
|
||||||
Field b = new FieldValueImpl(Type.String, "doi", "10.1033/0000000000");
|
|
||||||
Field c = new FieldValueImpl(Type.String, "doi", "");
|
|
||||||
|
|
||||||
double result1 = exactMatch.compare(a,a);
|
|
||||||
double result2 = exactMatch.compare(a,b);
|
|
||||||
double result3 = exactMatch.compare(a,c);
|
|
||||||
|
|
||||||
System.out.println("a = " + a);
|
|
||||||
System.out.println("b = " + b);
|
|
||||||
System.out.println("c = " + c);
|
|
||||||
|
|
||||||
System.out.println("a vs a = " + result1);
|
|
||||||
System.out.println("a vs b = " + result2);
|
|
||||||
System.out.println("a vs c = " + result3);
|
|
||||||
|
|
||||||
assertEquals(result1, 1.0);
|
|
||||||
assertEquals(result2, 0.0);
|
|
||||||
assertEquals(result3, -1.0);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testSimilarMatch() {
|
|
||||||
|
|
||||||
final SimilarMatch similarMatch = new SimilarMatch(params);
|
|
||||||
|
|
||||||
Field a = new FieldValueImpl(Type.String, "firstname", "sandro");
|
|
||||||
Field b = new FieldValueImpl(Type.String, "firstname", "s.");
|
|
||||||
Field c = new FieldValueImpl(Type.String, "firstname", "stefano");
|
|
||||||
|
|
||||||
double result1 = similarMatch.compare(a,b);
|
|
||||||
double result2 = similarMatch.compare(a,c);
|
|
||||||
double result3 = similarMatch.compare(b,c);
|
|
||||||
|
|
||||||
System.out.println("a = " + a);
|
|
||||||
System.out.println("b = " + b);
|
|
||||||
System.out.println("c = " + c);
|
|
||||||
|
|
||||||
System.out.println("a vs b = " + result1);
|
|
||||||
System.out.println("a vs c = " + result2);
|
|
||||||
System.out.println("b vs c = " + result3);
|
|
||||||
|
|
||||||
assertEquals(result1, 1.0);
|
|
||||||
assertEquals(result3, 1.0);
|
|
||||||
assertTrue(result2<0.7);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testTopicsMatch() {
|
|
||||||
|
|
||||||
final TopicsMatch topicsMatch = new TopicsMatch(params);
|
|
||||||
|
|
||||||
Field a = createFieldList(Arrays.asList("0.0", "1.0", "0.0"), "topics");
|
|
||||||
Field b = createFieldList(Arrays.asList("0.0", "0.0", "1.0"), "topics");
|
|
||||||
Field c = createFieldList(Arrays.asList("0.5", "0.5", "0.0"), "topics");
|
|
||||||
|
|
||||||
double result1 = topicsMatch.compare(a,a);
|
|
||||||
double result2 = topicsMatch.compare(a,c);
|
|
||||||
double result3 = topicsMatch.compare(b,c);
|
|
||||||
|
|
||||||
System.out.println("a = " + a);
|
|
||||||
System.out.println("b = " + b);
|
|
||||||
System.out.println("c = " + c);
|
|
||||||
|
|
||||||
System.out.println("a vs a = " + result1);
|
|
||||||
System.out.println("a vs c = " + result2);
|
|
||||||
System.out.println("b vs c = " + result3);
|
|
||||||
|
|
||||||
assertEquals(result1, 1.0);
|
|
||||||
assertEquals(result2, 0.5);
|
|
||||||
assertEquals(result3, 0.0);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testUndefinedNode() {
|
|
||||||
|
|
||||||
final UndefinedNode undefinedNode = new UndefinedNode();
|
|
||||||
double result = undefinedNode.compare(new FieldListImpl(),new FieldListImpl());
|
|
||||||
|
|
||||||
assertEquals(result, 0.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
Loading…
Reference in New Issue