master #59

Closed
claudio.atzori wants to merge 3221 commits from master into stable_ids
7 changed files with 73 additions and 14 deletions
Showing only changes of commit f791730330 - Show all commits

View File

@ -0,0 +1,31 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator registered under the name {@code "numbersMatch"}.
 *
 * <p>Both inputs are passed through the inherited {@code nfd} and {@code getNumbers}
 * helpers and the two resulting strings are compared for exact equality.
 * NOTE(review): presumably {@code nfd} normalizes the text and {@code getNumbers}
 * keeps only its numeric part - confirm against the helper implementations.
 */
@ComparatorClass("numbersMatch")
public class NumbersMatch extends AbstractComparator {

    /**
     * @param params comparator parameters, forwarded unchanged to the superclass
     */
    public NumbersMatch(Map<String, String> params) {
        super(params);
    }

    /**
     * Compares the numbers extracted from the two strings.
     *
     * @param a    first value
     * @param b    second value
     * @param conf configuration (not used by this comparator)
     * @return {@code -1.0} when at least one side yields an empty extraction,
     *         {@code 1.0} when both extractions are equal, {@code 0.0} otherwise
     */
    @Override
    public double distance(String a, String b, Config conf) {
        final String left = getNumbers(nfd(a));
        final String right = getNumbers(nfd(b));

        // the comparison is only meaningful when both sides produced something
        final boolean comparable = !left.isEmpty() && !right.isEmpty();
        if (!comparable) {
            return -1.0;
        }

        return left.equals(right) ? 1.0 : 0.0;
    }
}

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
@ -10,13 +11,17 @@ public class FieldStats implements Serializable {
private double weight; //weight for the field (to be used in the aggregation)
private double result; //the result of the comparison
private Field a;
private Field b;
private boolean countIfUndefined;
public FieldStats(double weight, double result, boolean countIfUndefined) {
/**
 * Builds the statistics entry describing one field comparison.
 *
 * @param weight           weight for the field (to be used in the aggregation)
 * @param result           the result of the comparison
 * @param countIfUndefined flag stored as-is; NOTE(review): presumably tells the
 *                         aggregation whether an undefined result still counts - confirm
 * @param a                first of the two compared fields
 * @param b                second of the two compared fields
 */
public FieldStats(double weight, double result, boolean countIfUndefined, Field a, Field b) {
    // keep the compared fields next to the outcome they produced
    this.a = a;
    this.b = b;

    this.countIfUndefined = countIfUndefined;
    this.result = result;
    this.weight = weight;
}
public double getWeight() {
@ -43,6 +48,22 @@ public class FieldStats implements Serializable {
this.countIfUndefined = countIfUndefined;
}
/**
 * @return the field stored as {@code a}, i.e. the first of the two fields
 *         kept with this comparison result
 */
public Field getA() {
return a;
}
/**
 * Replaces the field stored as {@code a}.
 *
 * @param a the new first field
 */
public void setA(Field a) {
this.a = a;
}
/**
 * @return the field stored as {@code b}, i.e. the second of the two fields
 *         kept with this comparison result
 */
public Field getB() {
return b;
}
/**
 * Replaces the field stored as {@code b}.
 *
 * @param b the new second field
 */
public void setB(Field b) {
this.b = b;
}
@Override
public String toString(){
try {

View File

@ -46,7 +46,7 @@ public class TreeNodeDef implements Serializable {
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, result, fieldConf.isCountIfUndefined()));
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())));
}

View File

@ -8138,7 +8138,7 @@ city::5393429;Santee;Cowles;Cowles Station;Cowlestown;Fanita;Santee;Santi;Sentij
city::5396003;Simi Valley;Chimii;Simi;Simi Vali;Simi Valley;Simi-Vehlli;Simi-Velli;Simih;Simiopolis;shimibare;simi bhyali;simibaelli;smy wyly;symy faly;symy wly kalyfrnya;xi mi gu;Сими Вали;Сими-Вэлли;Сімі-Веллі;סימי ואלי;سمی ویلی;سيمي فالي;سیمی ولی، کالیفرنیا;सिमि भ्याली;シミバレー;西米谷;시미밸리;
city::5397603;South Gate;Juzhen Gejt;Saut Gejt;South Gate;nan men;sa'utha geta;sausugeito;swt gyt kalyfrnya;swth ghyt;sʼwt gyyt;Саут Гејт;Южен Гейт;סאות גייט;ساؤتھ گیٹ;ساؤتھ گیٹ، کیلیفورنیا;سوت گیت، کالیفرنیا;سوث غيت;साउथ गेट;サウスゲイト;南门;
city::5397765;South San Francisco;Baden;Juzhen San Francisko;Saus-San-Francisko;Saut San Francisko;South San Francisco;nan jiu jin shan;sa'utha syana phransisko;saususanfuranshisuko;swt san fransyskw kalyfrnya;swwth san fransyskw;Саус-Сан-Франциско;Саут Сан Франциско;Южен Сан Франциско;سوت سان فرانسیسکو، کالیفرنیا;سووث سان فرانسيسكو;साउथ स्यान फ्रान्सिस्को;サウスサンフランシスコ;南旧金山;南舊金山;
city::5397841;South Whittier;;
city::5397841;South Whittier;
city::5399020;Stockton;SCK;Stockton;Stokt"n;Stokton;Tuleburg;Weberville;astwktwn;seutogteon;shi de dun;si tuo ke dun;stakatana;stwktwn;stwqtwn;sutokkuton;Стоктон;Стоктън;סטוקטון;استوکتون;ستوكتون;سٹوکٹن;स्टकटन;ストックトン;士德頓;斯托克顿;스톡턴;
city::5400075;Sunnyvale;Encinal;S"nivejl;Sanivejl;Sannivejl;Sunnyvale;saniberu;sanivela;sannibhela;sanyfal;sanywyl kalyfrnya;sen ni wei er;seonibeil;snywyl;Санивејл;Саннивейл;Саннівейл;Сънивейл;סאניווייל;سانيفال;سانی‌ویل، کالیفرنیا;سنیویل;सनीवेल;सन्नीभेल;サニーベール;森尼韦尔;서니베일;
city::5401395;Temecula;Temecula;Temecula Station;Temekjula;Temekula;te man ku la;temakula;temekyura;tmwkwla kalyfrnya;tymykwla;Темекула;Темекјула;تموکولا، کالیفرنیا;تيميكولا;ٹیمکولا;तेमाकुला;テメキュラ;特曼库拉;
@ -8152,7 +8152,7 @@ city::5404555;Union City;Juni"n Siti;Junion Siti;Junion-Siti;Thanh pho Union;Th
city::5404794;Universal City;Juniversal-Siti;Junivursul Siti;huan qiu shi;yunibeoseolsiti;ywnyfrsal syty;ywnywrsal syty kalyfrnya;Юниверсал-Сити;Юнивърсъл Сити;يونيفرسال سيتي;یونیورسال سیتی، کالیفرنیا;環球市;유니버설시티;
city::5404915;Upland;"plend;Apland;CCB;Magnolia Villa;North Ontario;Upland;a pu lan;abland;ap lnd kalyfrnya;apalyanda;eoblaendeu;Апланд;Ъпленд;آبلاند;آپ لند، کالیفرنیا;अपल्यांड;阿普兰;업랜드;
city::5405228;Vacaville;Vacaville;Vakavil;bhakabhila;fakafyly;wa ka wei er;wakawyl kalyfrnya;Βάκαβιλ;Вакавил;فاكافيلي;واکاولے;واکاویل، کالیفرنیا;भाकाभिल;瓦卡维尔;
city::5405288;Valencia;;
city::5405288;Valencia;
city::5405380;Vallejo;VLO;Val'ekho;Val'jo;Valejo;Valekho;Vallejo;balleio;bhaleyo;falyjw;vu~areho;wa lie huo;walhw kalyfrnya;walyjw;Валехо;Валејо;Вальехо;Вальйо;فاليجو;والهو، کالیفرنیا;والیجو;भालेयो;ヴァレーホ;瓦列霍;발레이오;
city::5405693;Van Nuys;Van-Najs;baennaijeu;fan nwys;Ван-Найс;فان نويس;ون نایز، لس‌آنجلس;밴나이즈;
city::5405878;Ventura;La Asumpta;La Asuncionde Nuestra Senora;La Pueblo de los Canoas;Mishkanakan;Mitskanakan;Poinsettia City;Ponsettia City by the Sea;Pueblo De las Canoas;Pueblo de Canoas;S B Ventura;San Buenaventura;The Palm City;Ventura;Ventura-by-the Sea;benchura;bentula;fyntwra;syana byuyanabhencura;wen tu la;wnchra;wntwra kalyfrnya;Вентура;فينتورا;ونتورا، کالیفرنیا;ونچرا;स्यान ब्युयनाभेन्चुरा;ベンチュラ;文图拉;벤투라;
@ -8200,7 +8200,7 @@ city::5512862;Sparks;Harriman;Sparks;asparks nwada;sbarks;seupakeuseu;Спар
city::5512909;Spring Valley;Spring Vali;Spring Valley;Spring-Vehlli;aspryng wly nwada;sbryngh faly;seupeulingbaelli;supuringubare;Спринг Вали;Спринг-Вэлли;اسپرینگ ولی، نوادا;سبرينغ فالي;سپرنگ ویلی، نیواڈا;スプリングバレー;스프링밸리;
city::5513343;Sunrise Manor;Sanrajz Mejnor;Sanrajz-Mehner;Sunrise Manor;sanraizumana;sanrayz mnwr nwada;snrayz manwr;Санрайз-Мэнер;Санрајз Мејнор;سانرایز منور، نوادا;سنرائز مینور، نیواڈا;سنرايز مانور;サンライズマナー;
city::5516233;Amarillo;AMA;Amarilas;Amarillo;Amarilo;Amarilyo;Oneida;a ma li luo;aemeolillo;amariro;amariyo;amarylw;amarylw tgzas;Амарилло;Амарило;Ամարիլլո;אמרילו;آماریلو، تگزاس;آماریلو، ٹیکساس;أماريلو;اماریلو;आमारियो;アマリロ;阿馬里洛;애머릴로;
city::5520993;El Paso;Concordia;ELP;Ehl'-Paso;El Pasas;El Paso;El'-Paso;El-Paso;Elpaso;Franklin;Gorad Ehl'-Pasa;Magoffinsville;Passus;ai er pa suo;al basw;al pasw;ayl pasw;el-paso;ela peso;ela pyaso;elpaeso;Ελ Πάσο;Горад Эль-Паса;Ел Пасо;Ель-Пасо;Эль-Пасо;Էլ Պասո;אל פאסו;إل باسو;ال پاسو;ایل پاسو;एल पॅसो;এল প্যাসো;เอลแพโซ;ელ-პასო;ელ-ფასო;エル・パソ;艾爾帕索;엘패소;
city::5520993;El Paso;ELP;Ehl'-Paso;El Pasas;El Paso;El'-Paso;El-Paso;Elpaso;Gorad Ehl'-Pasa;Magoffinsville;Passus;ai er pa suo;al basw;al pasw;ayl pasw;el-paso;ela peso;ela pyaso;elpaeso;Ελ Πάσο;Горад Эль-Паса;Ел Пасо;Ель-Пасо;Эль-Пасо;Էլ Պասո;אל פאסו;إل باسو;ال پاسو;ایل پاسو;एल पॅसो;এল প্যাসো;เอลแพโซ;ელ-პასო;ელ-ფასო;エル・パソ;艾爾帕索;엘패소;
city::5525577;Lubbock;LBB;Labbok;Labok;Lubbock;Lubuk;la bo ke;labak tgzas;labaka;lbk;leobeog;lwbwk;rabokku;Лаббок;Лабок;Лъбък;לאבק;لاباک، تگزاس;لابوک، ٹیکساس;لبک;لوبوك;लबक;ラボック;拉伯克;러벅;
city::5526337;Midland;MAF;Midland;Midlend;Midway;mdland;mi de lan;middorando;mideullaendeu;mydlnd;mydlnd tgzas;Мидланд;Мидленд;Мідленд;מידלנד;مدلاند;مڈلینڈ، ٹیکساس;میدلند، تگزاس;ミッドランド;米德蘭;미들랜드;
city::5527554;Odessa;Odesa;Odessa;ao de sa;awdsa;awdysa;odessa;Одеса;Одесса;أوديسا;اودسا;اوڈیسا، ٹیکساس;اوڈیسہ، ٹیکساس;オデッサ;敖德薩;

Can't render this file because it is too large.

View File

@ -557,9 +557,7 @@ starete
starà
starò
stata
state
stati
stato
stava
stavamo
stavano

View File

@ -1,11 +1,8 @@
package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.tree.CityMatch;
import eu.dnetlib.pace.tree.ContainsMatch;
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.tree.KeywordMatch;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
@ -52,6 +49,7 @@ public class ComparatorTest extends AbstractPaceFunctions {
//both names with cities (different)
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
//particular cases
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
@ -80,14 +78,17 @@ public class ComparatorTest extends AbstractPaceFunctions {
@Test
public void keywordMatchTest(){
params.put("threshold", "0.4");
params.put("threshold", "0.5");
final KeywordMatch keywordMatch = new KeywordMatch(params);
assertEquals(1.0, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(0.0, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
}
@Test
@ -102,5 +103,12 @@ public class ComparatorTest extends AbstractPaceFunctions {
assertEquals(0.0, containsMatch.distance("openorgs", "openorgs", conf));
}
/**
 * Two organization names that each carry a different digit ("2" vs "7")
 * are expected to be scored 0.0 by {@link NumbersMatch}.
 */
@Test
public void numbersMatchTest(){
    final NumbersMatch comparator = new NumbersMatch(params);

    final double score = comparator.distance("University of Rennes 2", "Universita di Rennes 7", conf);

    assertEquals(0.0, score);
}
}

View File

@ -136,7 +136,8 @@
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
"key::102": ["informatics","informatica","informática","informática","informatica",""],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
"key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"]
}
}
}