diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java new file mode 100644 index 0000000000..c2300d2078 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java @@ -0,0 +1,31 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; +/** Comparator (annotated name "numbersMatch") that scores two strings by comparing what the inherited getNumbers helper extracts from each of them. */ +@ComparatorClass("numbersMatch") +public class NumbersMatch extends AbstractComparator { + + /** Passes params straight to the AbstractComparator constructor. NOTE(review): raw Map; confirm whether the superclass expects Map<String, String>. */ + public NumbersMatch(Map params) { + super(params); + } + /** Returns (-1.0) if getNumbers yields an empty string for either input, 1.0 if both inputs yield equal strings, 0.0 otherwise; conf is not read here. */ + @Override + public double distance(String a, String b, Config conf) { + /* nfd and getNumbers are inherited helpers not shown in this file; presumably Unicode normalisation and digit extraction, TODO confirm */ + String numbers1 = getNumbers(nfd(a)); + String numbers2 = getNumbers(nfd(b)); + /* (-1.0) presumably marks the comparison as undefined (cf. countIfUndefined in FieldStats), TODO confirm */ + if (numbers1.isEmpty() || numbers2.isEmpty()) + return -1.0; + + if (numbers1.equals(numbers2)) + return 1.0; + + return 0.0; + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java index 2f1decd32e..9accded3e7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.tree.support; +import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.util.PaceException; import org.codehaus.jackson.map.ObjectMapper; @@ -10,13 +11,17 @@ public class FieldStats implements Serializable { private double weight; //weight for the field (to be used in the aggregation) private double result; //the result of the comparison + private Field a; //first compared field (TreeNodeDef passes the value taken from doc1) + private Field b; //second compared field (value taken from doc2); NOTE(review): this class is Serializable, confirm Field is too private boolean countIfUndefined; - public FieldStats(double weight, double result, boolean countIfUndefined) { + public FieldStats(double weight, double result, boolean countIfUndefined, Field a, Field b) { this.weight = weight;
this.result = result; this.countIfUndefined = countIfUndefined; + this.a = a; + this.b = b; } public double getWeight() { @@ -43,6 +48,22 @@ public class FieldStats implements Serializable { this.countIfUndefined = countIfUndefined; } + public Field getA() { /* returns the stored first field as is (no copy is made) */ + return a; + } + + public void setA(Field a) { /* replaces the stored first field */ + this.a = a; + } + + public Field getB() { /* returns the stored second field as is (no copy is made) */ + return b; + } + + public void setB(Field b) { /* replaces the stored second field */ + this.b = b; + } + @Override public String toString(){ try { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index f3d37c7245..cb3b7b4b09 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -46,7 +46,7 @@ public class TreeNodeDef implements Serializable { double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); - stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, result, fieldConf.isCountIfUndefined())); + stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()))); /* the stats entry now also keeps the two field values that were compared; NOTE(review): these are the same lookups as in the compare call above and could be hoisted into locals */ } diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv index 936c80adf1..066afa7a4c 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv @@ -8138,7 +8138,7 @@ city::5393429;Santee;Cowles;Cowles Station;Cowlestown;Fanita;Santee;Santi;Sentij city::5396003;Simi Valley;Chimii;Simi;Simi Vali;Simi
Valley;Simi-Vehlli;Simi-Velli;Simih;Simiopolis;shimibare;simi bhyali;simibaelli;smy wyly;symy faly;symy wly kalyfrnya;xi mi gu;Сими Вали;Сими-Вэлли;Сімі-Веллі;סימי ואלי;سمی ویلی;سيمي فالي;سیمی ولی، کالیفرنیا;सिमि भ्याली;シミバレー;西米谷;시미밸리; city::5397603;South Gate;Juzhen Gejt;Saut Gejt;South Gate;nan men;sa'utha geta;sausugeito;swt gyt kalyfrnya;swth ghyt;sʼwt gyyt;Саут Гејт;Южен Гейт;סאות גייט;ساؤتھ گیٹ;ساؤتھ گیٹ، کیلیفورنیا;سوت گیت، کالیفرنیا;سوث غيت;साउथ गेट;サウスゲイト;南门; city::5397765;South San Francisco;Baden;Juzhen San Francisko;Saus-San-Francisko;Saut San Francisko;South San Francisco;nan jiu jin shan;sa'utha syana phransisko;saususanfuranshisuko;swt san fransyskw kalyfrnya;swwth san fransyskw;Саус-Сан-Франциско;Саут Сан Франциско;Южен Сан Франциско;سوت سان فرانسیسکو، کالیفرنیا;سووث سان فرانسيسكو;साउथ स्यान फ्रान्सिस्को;サウスサンフランシスコ;南旧金山;南舊金山; -city::5397841;South Whittier;; +city::5397841;South Whittier; city::5399020;Stockton;SCK;Stockton;Stokt"n;Stokton;Tuleburg;Weberville;astwktwn;seutogteon;shi de dun;si tuo ke dun;stakatana;stwktwn;stwqtwn;sutokkuton;Стоктон;Стоктън;סטוקטון;استوکتون;ستوكتون;سٹوکٹن;स्टकटन;ストックトン;士德頓;斯托克顿;스톡턴; city::5400075;Sunnyvale;Encinal;S"nivejl;Sanivejl;Sannivejl;Sunnyvale;saniberu;sanivela;sannibhela;sanyfal;sany‌wyl kalyfrnya;sen ni wei er;seonibeil;snywyl;Санивејл;Саннивейл;Саннівейл;Сънивейл;סאניווייל;سانيفال;سانی‌ویل، کالیفرنیا;سنیویل;सनीवेल;सन्नीभेल;サニーベール;森尼韦尔;서니베일; city::5401395;Temecula;Temecula;Temecula Station;Temekjula;Temekula;te man ku la;temakula;temekyura;tmwkwla kalyfrnya;tymykwla;Темекула;Темекјула;تموکولا، کالیفرنیا;تيميكولا;ٹیمکولا;तेमाकुला;テメキュラ;特曼库拉; @@ -8152,7 +8152,7 @@ city::5404555;Union City;Juni"n Siti;Junion Siti;Junion-Siti;Thanh pho Union;Th city::5404794;Universal City;Juniversal-Siti;Junivursul Siti;huan qiu shi;yunibeoseolsiti;ywnyfrsal syty;ywnywrsal syty kalyfrnya;Юниверсал-Сити;Юнивърсъл Сити;يونيفرسال سيتي;یونیورسال سیتی، کالیفرنیا;環球市;유니버설시티; city::5404915;Upland;"plend;Apland;CCB;Magnolia Villa;North 
Ontario;Upland;a pu lan;abland;ap lnd kalyfrnya;apalyanda;eoblaendeu;Апланд;Ъпленд;آبلاند;آپ لند، کالیفرنیا;अपल्यांड;阿普兰;업랜드; city::5405228;Vacaville;Vacaville;Vakavil;bhakabhila;fakafyly;wa ka wei er;wakawyl kalyfrnya;Βάκαβιλ;Вакавил;فاكافيلي;واکاولے;واکاویل، کالیفرنیا;भाकाभिल;瓦卡维尔; -city::5405288;Valencia;; +city::5405288;Valencia; city::5405380;Vallejo;VLO;Val'ekho;Val'jo;Valejo;Valekho;Vallejo;balleio;bhaleyo;falyjw;vu~areho;wa lie huo;walhw kalyfrnya;walyjw;Валехо;Валејо;Вальехо;Вальйо;فاليجو;والهو، کالیفرنیا;والیجو;भालेयो;ヴァレーホ;瓦列霍;발레이오; city::5405693;Van Nuys;Van-Najs;baennaijeu;fan nwys;Ван-Найс;فان نويس;ون نایز، لس‌آنجلس;밴나이즈; city::5405878;Ventura;La Asumpta;La Asuncionde Nuestra Senora;La Pueblo de los Canoas;Mishkanakan;Mitskanakan;Poinsettia City;Ponsettia City by the Sea;Pueblo De las Canoas;Pueblo de Canoas;S B Ventura;San Buenaventura;The Palm City;Ventura;Ventura-by-the Sea;benchura;bentula;fyntwra;syana byuyanabhencura;wen tu la;wnchra;wntwra kalyfrnya;Вентура;فينتورا;ونتورا، کالیفرنیا;ونچرا;स्यान ब्युयनाभेन्चुरा;ベンチュラ;文图拉;벤투라; @@ -8200,7 +8200,7 @@ city::5512862;Sparks;Harriman;Sparks;asparks nwada;sbarks;seupakeuseu;Спар city::5512909;Spring Valley;Spring Vali;Spring Valley;Spring-Vehlli;aspryng wly nwada;sbryngh faly;seupeulingbaelli;supuringubare;Спринг Вали;Спринг-Вэлли;اسپرینگ ولی، نوادا;سبرينغ فالي;سپرنگ ویلی، نیواڈا;スプリングバレー;스프링밸리; city::5513343;Sunrise Manor;Sanrajz Mejnor;Sanrajz-Mehner;Sunrise Manor;sanraizumana;sanrayz mnwr nwada;snrayz manwr;Санрайз-Мэнер;Санрајз Мејнор;سانرایز منور، نوادا;سنرائز مینور، نیواڈا;سنرايز مانور;サンライズマナー; city::5516233;Amarillo;AMA;Amarilas;Amarillo;Amarilo;Amarilyo;Oneida;a ma li luo;aemeolillo;amariro;amariyo;amarylw;amarylw tgzas;Амарилло;Амарило;Ամարիլլո;אמרילו;آماریلو، تگزاس;آماریلو، ٹیکساس;أماريلو;اماریلو;आमारियो;アマリロ;阿馬里洛;애머릴로; -city::5520993;El Paso;Concordia;ELP;Ehl'-Paso;El Pasas;El Paso;El'-Paso;El-Paso;Elpaso;Franklin;Gorad Ehl'-Pasa;Magoffinsville;Passus;ai er pa suo;al basw;al pasw;ayl 
pasw;el-paso;ela peso;ela pyaso;elpaeso;Ελ Πάσο;Горад Эль-Паса;Ел Пасо;Ель-Пасо;Эль-Пасо;Էլ Պասո;אל פאסו;إل باسو;ال پاسو;ایل پاسو;एल पॅसो;এল প্যাসো;เอลแพโซ;ელ-პასო;ელ-ფასო;エル・パソ;艾爾帕索;엘패소; +city::5520993;El Paso;ELP;Ehl'-Paso;El Pasas;El Paso;El'-Paso;El-Paso;Elpaso;Gorad Ehl'-Pasa;Magoffinsville;Passus;ai er pa suo;al basw;al pasw;ayl pasw;el-paso;ela peso;ela pyaso;elpaeso;Ελ Πάσο;Горад Эль-Паса;Ел Пасо;Ель-Пасо;Эль-Пасо;Էլ Պասո;אל פאסו;إل باسو;ال پاسو;ایل پاسو;एल पॅसो;এল প্যাসো;เอลแพโซ;ელ-პასო;ელ-ფასო;エル・パソ;艾爾帕索;엘패소; city::5525577;Lubbock;LBB;Labbok;Labok;Lubbock;Lubuk;la bo ke;labak tgzas;labaka;lbk;leobeog;lwbwk;rabokku;Лаббок;Лабок;Лъбък;לאבק;لاباک، تگزاس;لابوک، ٹیکساس;لبک;لوبوك;लबक;ラボック;拉伯克;러벅; city::5526337;Midland;MAF;Midland;Midlend;Midway;mdland;mi de lan;middorando;mideullaendeu;mydlnd;mydlnd tgzas;Мидланд;Мидленд;Мідленд;מידלנד;مدلاند;مڈلینڈ، ٹیکساس;میدلند، تگزاس;ミッドランド;米德蘭;미들랜드; city::5527554;Odessa;Odesa;Odessa;ao de sa;awdsa;awdysa;odessa;Одеса;Одесса;أوديسا;اودسا;اوڈیسا، ٹیکساس;اوڈیسہ، ٹیکساس;オデッサ;敖德薩; diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt index e6131bca9b..5b1434de31 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt @@ -557,9 +557,7 @@ starete starà starò stata -state stati -stato stava stavamo stavano diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 920d00eeb6..7e69b0e3a7 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -1,11 +1,8 @@ package eu.dnetlib.pace.comparators; import eu.dnetlib.pace.clustering.NGramUtils; -import eu.dnetlib.pace.tree.CityMatch; -import 
eu.dnetlib.pace.tree.ContainsMatch; -import eu.dnetlib.pace.tree.JaroWinklerNormalizedName; +import eu.dnetlib.pace.tree.*; /* NOTE(review): wildcard import replaces the four explicit imports; explicit imports are usually preferred */ import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.tree.KeywordMatch; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; @@ -52,6 +49,7 @@ public class ComparatorTest extends AbstractPaceFunctions { //both names with cities (different) assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf)); + assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf)); //"Franklin" and "Concordia" are removed from the El Paso aliases in city_map.csv by this same change //particular cases assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf)); @@ -80,14 +78,17 @@ public class ComparatorTest extends AbstractPaceFunctions { @Test public void keywordMatchTest(){ - params.put("threshold", "0.4"); + params.put("threshold", "0.5"); //was "0.4" final KeywordMatch keywordMatch = new KeywordMatch(params); - assertEquals(1.0, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); + assertEquals(0.0, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); //expectation flipped from 1.0 together with the threshold change assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf)); assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf)); assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); + assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf)); + assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); //"state" is added as keyword key::105 in organization.current.conf and dropped from stopwords_it.txt by this same change + } @Test @@ -102,5 +103,12 @@ public class ComparatorTest extends AbstractPaceFunctions { assertEquals(0.0, containsMatch.distance("openorgs", "openorgs", conf)); } + @Test + public void numbersMatchTest(){ //covers only the differing-numbers case (0.0); NOTE(review): the 1.0 and (-1.0) branches of NumbersMatch.distance are not exercised + final NumbersMatch numbersMatch = new NumbersMatch(params); + + assertEquals(0.0,
numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf)); + } + } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf index 510de0398f..b2ab6ae92d 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf @@ -136,7 +136,8 @@ "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""], "key::102": ["informatics","informatica","informática","informática","informatica",""], "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], - "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], + "key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"] } } } \ No newline at end of file