From 93b332cbe571778abf3c4b03463bc1d4d9eb2809 Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 25 Sep 2019 09:53:06 +0200 Subject: [PATCH 01/13] translation map updated --- .../src/main/java/eu/dnetlib/SparkLocalTest.java | 10 +++++----- .../eu/dnetlib/pace/organization.to.fix.json | 15 +++------------ .../eu/dnetlib/pace/config/translation_map.csv | 2 +- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java index 9ffbdfa..2a38c6b 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java @@ -88,11 +88,11 @@ public class SparkLocalTest { connectedComponents.foreach(cc -> { System.out.println(cc); }); - connectedComponents.foreach(cc -> { - cc.getDocs().stream().forEach(d -> { - System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname")); - }); - }); +// connectedComponents.foreach(cc -> { +// cc.getDocs().stream().forEach(d -> { +// System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname")); +// }); +// }); //print nondeduped nonDeduplicated.foreach(cc -> { System.out.println(cc); diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json index ece53ae..fc16fc3 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json @@ -1,12 +1,3 @@ -{"dateoftransformation":"2019-07-22","originalId":["corda_______::999895789"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta 
Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIFI"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unifi.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI FIRENZE"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::19137683d6d3cd4dda5054af05081b6f"} -{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999895789"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIFI"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unifi.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI FIRENZE"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::19137683d6d3cd4dda5054af05081b6f"} 
-{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Firenze_University_Press"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Firenze University Press"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Firenze University Press"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::b29ae16abb2343c6ffc152666b24ea95"} -{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Università degli Studi di Firenze"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::115715507c87ade107909750c44fbee5"} -{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"University of 
Florence"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::60e21d5264c51c62f154afa6166ba21b"} -{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"University of Florence"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::a6d1d3c2eb368cb2ab1ff293c625d90e"} -{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Université de florence"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::c8b8860f04bf3c755f4632395ea27375"} -{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Universität 
Florenz"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::ff05feef920762cbef5de7640dcb718e"} -{"originalId":["https://academic.microsoft.com/#/detail/45084792"],"pid":[{"qualifier":{"classid":"urn","classname":"urn","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"http://en.wikipedia.org/wiki/University_of_Florence"},{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"},{"qualifier":{"classid":"mag_id","classname":"Microsoft Academic Graph Identifier","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"https://academic.microsoft.com/#/detail/45084792"}],"collectedfrom":[{"value":"Microsoft Academic Graph","key":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"}],"organization":{"metadata":{"websiteurl":{"value":"http://www.unifi.it/"},"legalname":{"value":"University of Florence"}}},"type":20,"id":"20|microsoft___::adecd59d8ff7f5aaedac013fa0f54ffe"} -{"dateoftransformation":"2018-09-13","originalId":["openaire____::issn20381026::Università degli Studi di Firenze"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Firenze"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-06-02","type":20,"id":"20|openaire____::55a8725b9d9a9a67615018901270de4b"} 
-{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Università_degli_Studi_di_Firenze"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unifi.it/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Firenze"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::4f194641be797be5e5eb11227e962145"} -{"dateoftransformation":"2018-09-13","originalId":["snsf________::Università_degli_Studi_di_Firenze"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Firenze"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-23","type":20,"id":"20|snsf________::4f194641be797be5e5eb11227e962145"} \ No newline at end of file +{"dateoftransformation":"2018-11-20","originalId":["corda_______::999987066"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta 
Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NATIONAAL LUCHT- EN RUIMTEVAARTLABORATORIUM"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::9cb56cf06fbe3926d0c88ee320908848"} +{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999987066"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NATIONAAL LUCHT- EN RUIMTEVAARTLABORATORIUM"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::9cb56cf06fbe3926d0c88ee320908848"} 
+{"dateoftransformation":"2018-11-12","originalId":["opendoar____::Netherlands_Aerospace_Centre"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Netherlands Aerospace Centre"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-11-12","type":20,"id":"20|opendoar____::ce12359dec61a8e00837c3e507918812"} diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv index c74b357..8ad1996 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv @@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο key::39;bureau;ufficio;bureau;офис;bureau;γραφείο key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία -key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology +key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;universidad politecnica;universitat 
politecnica;politechnika;politechniki;university technology;university science technology key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί From fda7f1ce930d68d30dbfc9c8e1a4c8c11c376f3e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 25 Sep 2019 10:15:13 +0200 Subject: [PATCH 02/13] updated translation map and some tests --- .../eu/dnetlib/pace/config/translation_map.csv | 2 +- .../eu/dnetlib/pace/distance/DistanceAlgoTest.java | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv index 8ad1996..e97fd52 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv @@ -1,4 +1,4 @@ -key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti +key::1;university;universita;universita studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza 
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index ec55b87..3bf300b 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -67,7 +67,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York"); - assertEquals(result, 1.0); + assertEquals(1.0, result); } @Test @@ -77,7 +77,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna"); System.out.println("result = " + result); - assertEquals(result, 0.0); + assertEquals(0.0, result); } @Test @@ -87,7 +87,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa"); System.out.println("result = " + result); - assertEquals(result, 1.0); + assertEquals(1.0, result); } @Test @@ -97,7 +97,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS"); System.out.println("result = " + result); - assertEquals(result, 1.0); + assertEquals(1.0, result); } @Test @@ -107,7 +107,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung"); 
System.out.println("result = " + result); - assertTrue(result> 0.9); + assertTrue(result > 0.9); } @@ -118,7 +118,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO"); System.out.println("result = " + result); - assertTrue(result> 0.9); + assertTrue(result > 0.9); } @Test From 259d502d70210a6008ba91b35e71dafafcabe6ff Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 25 Sep 2019 10:39:39 +0200 Subject: [PATCH 03/13] [maven-release-plugin] prepare release dnet-dedup-3.0.14 --- dnet-dedup-test/pom.xml | 2 +- dnet-pace-core/pom.xml | 2 +- pom.xml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index e5d429b..03fb97b 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.14-SNAPSHOT + 3.0.14 ../pom.xml diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 9499750..1f3f233 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.14-SNAPSHOT + 3.0.14 ../pom.xml diff --git a/pom.xml b/pom.xml index 57ca8d6..4d50e34 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.14-SNAPSHOT + 3.0.14 pom @@ -22,7 +22,7 @@ scm:git:https://github.com/dnet-team/dnet-dedup.git - HEAD + dnet-dedup-3.0.14 From 42e3bff05fc3bf5f9c1c7e39aafaf3c608248031 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 25 Sep 2019 10:39:46 +0200 Subject: [PATCH 04/13] [maven-release-plugin] prepare for next development iteration --- dnet-dedup-test/pom.xml | 2 +- dnet-pace-core/pom.xml | 2 +- pom.xml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 03fb97b..b2b0437 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.14 + 
3.0.15-SNAPSHOT ../pom.xml diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 1f3f233..34138cc 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.14 + 3.0.15-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 4d50e34..4e9d3fe 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.14 + 3.0.15-SNAPSHOT pom @@ -22,7 +22,7 @@ scm:git:https://github.com/dnet-team/dnet-dedup.git - dnet-dedup-3.0.14 + HEAD From 03c1b334d5d20fc9e1d2812a1262899e97182ac4 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 8 Oct 2019 14:53:52 +0200 Subject: [PATCH 05/13] translation map moved in json configuration, support for synonyms added in the configuration, now the configuration is argument of conditions, distancealgos and clusteringfunctions --- .../resources/eu/dnetlib/pace/org.curr.conf | 106 +++++++++++++++ .../AbstractClusteringFunction.java | 7 +- .../eu/dnetlib/pace/clustering/Acronyms.java | 3 +- .../pace/clustering/ClusteringCombiner.java | 8 +- .../pace/clustering/ClusteringFunction.java | 3 +- .../pace/clustering/ImmutableFieldValue.java | 3 +- .../pace/clustering/KeywordsClustering.java | 9 +- .../pace/clustering/LowercaseClustering.java | 7 +- .../dnetlib/pace/clustering/NgramPairs.java | 3 +- .../eu/dnetlib/pace/clustering/Ngrams.java | 4 +- .../pace/clustering/PersonClustering.java | 3 +- .../dnetlib/pace/clustering/PersonHash.java | 3 +- .../clustering/RandomClusteringFunction.java | 4 +- .../pace/clustering/SortedNgramPairs.java | 3 +- .../clustering/SpaceTrimmingFieldValue.java | 3 +- .../dnetlib/pace/clustering/SuffixPrefix.java | 3 +- .../pace/clustering/UrlClustering.java | 3 +- .../pace/common/AbstractPaceFunctions.java | 15 ++- .../pace/condition/AbstractCondition.java | 9 +- .../pace/condition/AlwaysTrueCondition.java | 4 +- .../dnetlib/pace/condition/ConditionAlgo.java | 4 +- .../eu/dnetlib/pace/condition/ExactMatch.java | 3 +- 
.../pace/condition/ExactMatchIgnoreCase.java | 3 +- .../pace/condition/MustBeDifferent.java | 4 +- .../eu/dnetlib/pace/condition/PidMatch.java | 4 +- .../eu/dnetlib/pace/condition/SizeMatch.java | 3 +- .../pace/condition/TitleVersionMatch.java | 3 +- .../eu/dnetlib/pace/condition/YearMatch.java | 3 +- .../java/eu/dnetlib/pace/config/Config.java | 2 + .../eu/dnetlib/pace/config/DedupConfig.java | 6 + .../eu/dnetlib/pace/config/PaceConfig.java | 27 +++- .../dnetlib/pace/distance/DistanceAlgo.java | 3 +- .../dnetlib/pace/distance/DistanceScorer.java | 4 +- .../distance/SecondStringDistanceAlgo.java | 13 +- .../pace/distance/algo/AlwaysMatch.java | 3 +- .../pace/distance/algo/ExactMatch.java | 3 +- .../pace/distance/algo/JaroWinkler.java | 3 +- .../algo/JaroWinklerNormalizedName.java | 7 +- .../pace/distance/algo/JaroWinklerTitle.java | 3 +- .../distance/algo/Level2JaroWinklerTitle.java | 3 +- .../pace/distance/algo/LevensteinTitle.java | 3 +- .../algo/LevensteinTitleIgnoreVersion.java | 3 +- .../pace/distance/algo/MustBeDifferent.java | 3 +- .../pace/distance/algo/NullDistanceAlgo.java | 3 +- .../distance/algo/SubStringLevenstein.java | 5 +- .../pace/distance/algo/UrlMatcher.java | 5 +- .../clustering/ClusteringFunctionTest.java | 39 +++--- .../pace/distance/DistanceAlgoTest.java | 25 ++-- .../eu/dnetlib/pace/config/org.curr.conf | 124 ++++++++++++++++-- 49 files changed, 410 insertions(+), 107 deletions(-) diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf index 21fd1e2..dfa22de 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf @@ -33,6 +33,112 @@ ], "blacklists" : { "legalname" : [] + }, + "synonyms": { + "key::1": ["university","università","università 
studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": 
["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","museo","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": 
["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita 
politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": 
["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""], + "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""], + "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""], + "key::59": 
["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""], + "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""], + "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""], + "key::68": 
["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärztlich","veterinair","veeartsenijkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""], + "key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline",""], + "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""], + "key::77": 
["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""], + "key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""], + "key::85": 
["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], + "key::95": 
["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""], + "key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""], + "key::102": ["informatics","informatica","informática","informática","informatica",""], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] } } } \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index 1782b87..7fdcce4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -18,15 +19,15 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i this.params = params; } - protected abstract Collection doApply(String s); + protected abstract Collection doApply(Config conf, String s); @Override - public Collection apply(List fields) { + public Collection apply(Config conf, List fields) { return fields.stream().filter(f -> !f.isEmpty()) .map(Field::stringValue) .map(this::normalize) .map(s -> filterAllStopWords(s)) - .map(this::doApply) + .map(s -> doApply(conf, s)) .map(c -> filterBlacklisted(c, ngramBlacklist)) .flatMap(c -> c.stream()) .filter(StringUtils::isNotBlank) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java index ee5efc9..d300833 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java @@ -6,6 +6,7 @@ import java.util.Set; import java.util.StringTokenizer; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; @ClusteringClass("acronyms") public class Acronyms extends AbstractClusteringFunction { @@ -15,7 +16,7 @@ public class Acronyms extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return extractAcronyms(s, param("max"), param("minLen"), 
param("maxLen")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java index a4b58aa..52859b4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java @@ -13,15 +13,15 @@ import eu.dnetlib.pace.model.Field; public class ClusteringCombiner { public static Collection combine(final Document a, final Config conf) { - return new ClusteringCombiner().doCombine(a, conf.clusterings()); + return new ClusteringCombiner().doCombine(a, conf); } - private Collection doCombine(final Document a, final List defs) { + private Collection doCombine(final Document a, final Config conf) { final Collection res = Sets.newLinkedHashSet(); - for (final ClusteringDef cd : defs) { + for (final ClusteringDef cd : conf.clusterings()) { for (final String fieldName : cd.getFields()) { final Field values = a.values(fieldName); - res.addAll(cd.clusteringFunction().apply((List) values)); + res.addAll(cd.clusteringFunction().apply(conf, (List) values)); } } return res; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java index 4fe1b59..0554d27 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -4,11 +4,12 @@ import java.util.Collection; import java.util.List; import java.util.Map; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; public interface ClusteringFunction { - public Collection apply(List fields); + public Collection apply(Config config, List fields); public Map getParams(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java index fab8e98..7f342f6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java @@ -5,6 +5,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; @ClusteringClass("immutablefieldvalue") public class ImmutableFieldValue extends AbstractClusteringFunction { @@ -14,7 +15,7 @@ public class ImmutableFieldValue extends AbstractClusteringFunction { } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); res.add(s); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index 1cabecd..1680ab0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -15,10 +16,10 @@ public class KeywordsClustering extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(final Config conf, String s) { //takes city codes and keywords codes without duplicates - Set keywords = getKeywords(s, params.getOrDefault("windowSize", 4)); + Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4)); Set cities = getCities(s, params.getOrDefault("windowSize", 4)); //list of combination to return as result @@ -37,13 +38,13 @@ public class KeywordsClustering 
extends AbstractClusteringFunction { } @Override - public Collection apply(List fields) { + public Collection apply(final Config conf, List fields) { return fields.stream().filter(f -> !f.isEmpty()) .map(Field::stringValue) .map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here? .map(this::normalize) .map(s -> filterAllStopWords(s)) - .map(this::doApply) + .map(s -> doApply(conf, s)) .map(c -> filterBlacklisted(c, ngramBlacklist)) .flatMap(c -> c.stream()) .filter(StringUtils::isNotBlank) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java index 5ec8590..6fe525f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -6,6 +6,7 @@ import java.util.Map; import com.google.common.collect.Lists; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -17,16 +18,16 @@ public class LowercaseClustering extends AbstractClusteringFunction { } @Override - public Collection apply(List fields) { + public Collection apply(Config conf, List fields) { Collection c = Sets.newLinkedHashSet(); for(Field f : fields) { - c.addAll(doApply(f.stringValue())); + c.addAll(doApply(conf, f.stringValue())); } return c; } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { if(StringUtils.isBlank(s)) { return Lists.newArrayList(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java index 06885be..baa30d7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -6,6 +6,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; @ClusteringClass("ngrampairs") public class NgramPairs extends Ngrams { @@ -15,7 +16,7 @@ public class NgramPairs extends Ngrams { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java index 8549468..214b145 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -1,5 +1,7 @@ package eu.dnetlib.pace.clustering; +import eu.dnetlib.pace.config.Config; + import java.util.*; @ClusteringClass("ngrams") @@ -10,7 +12,7 @@ public class Ngrams extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java index 718b88d..26b07f0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.clustering; import com.google.common.collect.Sets; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Person; import org.apache.commons.lang.StringUtils; @@ 
-23,7 +24,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin } @Override - public Collection apply(final List fields) { + public Collection apply(final Config conf, final List fields) { final Set hashes = Sets.newHashSet(); for (final Field f : fields) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java index fcb01b9..2020a66 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -6,6 +6,7 @@ import java.util.Map; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Person; @ClusteringClass("personhash") @@ -18,7 +19,7 @@ public class PersonHash extends AbstractClusteringFunction { } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? 
getParams().get("aggressive") : DEFAULT_AGGRESSIVE); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java index f012aac..c485fcb 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -1,5 +1,7 @@ package eu.dnetlib.pace.clustering; +import eu.dnetlib.pace.config.Config; + import java.util.Collection; import java.util.Map; @@ -10,7 +12,7 @@ public class RandomClusteringFunction extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(final Config conf, String s) { // TODO Auto-generated method stub return null; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java index 2f475fe..55b203d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -5,6 +5,7 @@ import java.util.*; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; @ClusteringClass("sortedngrampairs") public class SortedNgramPairs extends NgramPairs { @@ -14,7 +15,7 @@ public class SortedNgramPairs extends NgramPairs { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { final List tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s)); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java index 22dc490..fd8e7a3 
100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java @@ -4,6 +4,7 @@ import java.util.Collection; import java.util.List; import java.util.Map; +import eu.dnetlib.pace.config.Config; import org.apache.commons.lang.RandomStringUtils; import org.apache.commons.lang.StringUtils; @@ -17,7 +18,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java index 3960331..fa1f643 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java @@ -5,6 +5,7 @@ import java.util.Map; import java.util.Set; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; @ClusteringClass("suffixprefix") public class SuffixPrefix extends AbstractClusteringFunction { @@ -14,7 +15,7 @@ public class SuffixPrefix extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return suffixPrefix(s, param("len"), param("max")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index 9955d5f..feb60a2 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ 
-1,6 +1,7 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import java.net.MalformedURLException; @@ -21,7 +22,7 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu } @Override - public Collection apply(List fields) { + public Collection apply(final Config conf, List fields) { try { return fields.stream() .filter(f -> !f.isEmpty()) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 24379c6..3050293 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -13,6 +13,8 @@ import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; +import java.io.IOException; +import java.io.StringWriter; import java.text.Normalizer; import java.util.*; import java.util.regex.Matcher; @@ -327,12 +329,17 @@ public abstract class AbstractPaceFunctions { return codes; } - public Set getKeywords(String s1, int windowSize) { - return getKeywords(s1, translationMap, windowSize); - } - public Set getCities(String s1, int windowSize) { return getKeywords(s1, cityMap, windowSize); } + public static String readFromClasspath(final String filename, final Class clazz) { + final StringWriter sw = new StringWriter(); + try { + IOUtils.copy(clazz.getResourceAsStream(filename), sw); + return sw.toString(); + } catch (final IOException e) { + throw new RuntimeException("cannot load resource from classpath: " + filename); + } + } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java index 2b4aa29..8f6bf3e 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition; import java.util.List; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.model.Document; @@ -25,10 +26,10 @@ public abstract class AbstractCondition extends AbstractPaceFunctions implements this.fields = fields; } - protected abstract ConditionEval verify(FieldDef fd, Field a, Field b); + protected abstract ConditionEval verify(FieldDef fd, Field a, Field b, Config conf); @Override - public ConditionEvalMap verify(final Document a, final Document b) { + public ConditionEvalMap verify(final Document a, final Document b, final Config conf) { final ConditionEvalMap res = new ConditionEvalMap(); for (final FieldDef fd : getFields()) { @@ -36,12 +37,12 @@ public abstract class AbstractCondition extends AbstractPaceFunctions implements final Field vb = b.values(fd.getName()); if (fd.isIgnoreMissing()) { - res.put(fd.getName(), verify(fd, va, vb)); + res.put(fd.getName(), verify(fd, va, vb, conf)); } else { if (va.isEmpty() || vb.isEmpty()) { res.put(fd.getName(), new ConditionEval(cond, va, vb, -1)); } else { - res.put(fd.getName(), verify(fd, va, vb)); + res.put(fd.getName(), verify(fd, va, vb, conf)); } } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java index 2274da5..633ade3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java @@ -1,6 +1,8 @@ package eu.dnetlib.pace.condition; import java.util.List; + +import eu.dnetlib.pace.config.Config; 
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -18,7 +20,7 @@ public class AlwaysTrueCondition extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { return new ConditionEval(cond, a, b, 1); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java index 787ad9a..34e6de9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java @@ -1,6 +1,8 @@ package eu.dnetlib.pace.condition; import java.util.List; + +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.model.Document; import eu.dnetlib.pace.model.FieldDef; @@ -22,6 +24,6 @@ public interface ConditionAlgo { * @return 0 when condition cannot be verified (ignoremissing = true). Positive int when the condition is verified. Negative int when * the condition is not verified. 
*/ - public abstract ConditionEvalMap verify(Document a, Document b); + public abstract ConditionEvalMap verify(Document a, Document b, Config conf); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java index a4cd847..755e815 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition; import java.util.List; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -20,7 +21,7 @@ public class ExactMatch extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { final String fa = getValue(a); final String fb = getValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java index e9925ec..672980c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition; import java.util.List; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -20,7 +21,7 @@ public class ExactMatchIgnoreCase extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { final String fa = getValue(a); 
final String fb = getValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java index f2b3bdb..630e234 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace.condition; import java.util.List; import com.google.common.collect.Iterables; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -30,7 +31,8 @@ public class MustBeDifferent extends AbstractCondition { * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List) */ @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) + { final String fa = getValue(a); final String fb = getValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java index c15729e..92378f3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java @@ -6,6 +6,7 @@ import java.util.Set; import java.util.stream.Collectors; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -29,7 +30,8 @@ public class PidMatch extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, 
final Config conf + ) { final List sa = ((FieldList) a).stringList(); final List sb = ((FieldList) b).stringList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java index afd0a8e..6343201 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java @@ -4,6 +4,7 @@ import java.util.List; import com.google.common.collect.Iterables; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -32,7 +33,7 @@ public class SizeMatch extends AbstractCondition { * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List) */ @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { // if (a.isEmpty() & b.isEmpty()) return 1; // diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java index 4b94a04..844cbf8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition; import java.util.List; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -20,7 +21,7 @@ public class TitleVersionMatch extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final 
Field b, final Config conf) { final String valueA = getFirstValue(a); final String valueB = getFirstValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java index 71bb6cf..af8635c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace.condition; import java.time.Year; import java.util.List; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import org.apache.commons.lang.StringUtils; @@ -34,7 +35,7 @@ public class YearMatch extends AbstractCondition { // } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { final String valueA = getNumbers(getFirstValue(a)); final String valueB = getNumbers(getFirstValue(b)); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 7498c23..3ff299e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -56,4 +56,6 @@ public interface Config { */ public Map> blacklists(); + + public Map translationMap(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index 1cfcb08..f252414 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -55,6 +55,7 @@ public class DedupConfig implements Config, Serializable { try { config = new ObjectMapper().readValue(json, DedupConfig.class); config.getPace().initModel(); + 
config.getPace().initTranslationMap(); return config; } catch (IOException e) { throw new PaceException("Error in parsing configuration json", e); @@ -144,4 +145,9 @@ public class DedupConfig implements Config, Serializable { return getPace().getBlacklists(); } + @Override + public Map translationMap() { + return getPace().translationMap(); + } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 4fde1de..490fbaf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -22,6 +22,10 @@ public class PaceConfig implements Serializable { private List conditions; private List clustering; private Map> blacklists; + private Map> synonyms; + + @JsonIgnore + private Map translationMap; @JsonIgnore private Map modelMap; @@ -33,11 +37,24 @@ public class PaceConfig implements Serializable { public void initModel() { modelMap = Maps.newHashMap(); - for(FieldDef fd : getModel()) { + for (FieldDef fd : getModel()) { modelMap.put(fd.getName(), fd); } } + public void initTranslationMap(){ + translationMap = Maps.newHashMap(); + for (String key : synonyms.keySet()) { + for (String term : synonyms.get(key)){ + translationMap.put(term.toLowerCase(), key); + } + } + } + + public Map translationMap(){ + return translationMap; + } + public List getModel() { return model; } @@ -88,6 +105,14 @@ public class PaceConfig implements Serializable { this.blacklists = blacklists; } + public Map> getSynonyms() { + return synonyms; + } + + public void setSynonyms(Map> synonyms) { + this.synonyms = synonyms; + } + public Map getModelMap() { return modelMap; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java index 5e4f69f..8d2b9bd 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import java.util.Map; @@ -10,7 +11,7 @@ import java.util.Map; */ public interface DistanceAlgo { - public abstract double distance(Field a, Field b); + public abstract double distance(Field a, Field b, Config conf); public double getWeight(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java index bb3c37e..2f10aca 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java @@ -49,7 +49,7 @@ public class DistanceScorer { final ConditionEvalMap res = new ConditionEvalMap(); for (final ConditionAlgo cd : conditions) { - final ConditionEvalMap map = cd.verify(a, b); + final ConditionEvalMap map = cd.verify(a, b, config); res.mergeFrom(map); // commented out shortcuts @@ -82,7 +82,7 @@ public class DistanceScorer { } } else { if (va.getType().equals(vb.getType())) { - de.setDistance(w * fd.distanceAlgo().distance(va, vb)); + de.setDistance(w * fd.distanceAlgo().distance(va, vb, config)); } else { throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType())); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java index 9cc3529..b710ccf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java @@ -7,6 +7,7 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; 
import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; @@ -69,7 +70,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp * the b * @return the double */ - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { double score = ssalgo.score(a, b); return normalize(score); } @@ -83,8 +84,8 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp * the b * @return the double */ - protected double distance(final List a, final List b) { - return distance(concat(a), concat(b)); + protected double distance(final List a, final List b, final Config conf) { + return distance(concat(a), concat(b), conf); } /* @@ -93,9 +94,9 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp * @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) */ @Override - public double distance(final Field a, final Field b) { - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue()); - if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b)); + public double distance(final Field a, final Field b, final Config conf) { + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf); + if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf); throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java index 503235c..bab477e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -22,7 +23,7 @@ public class AlwaysMatch extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { return 1.0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java index 44d881e..66ff3c5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -22,7 +23,7 @@ public class ExactMatch extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { return a.equals(b) ? 
1.0 : 0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java index 20c0912..9a89fe9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -24,7 +25,7 @@ public class JaroWinkler extends SecondStringDistanceAlgo { } @Override - public double distance(String a, String b) { + public double distance(String a, String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java index 546629b..64ab74c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -27,7 +28,7 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { } @Override - public double distance(String a, String b) { + public double distance(String a, String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); @@ -37,8 +38,8 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); - 
Set keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue()); - Set keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue()); + Set keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); + Set keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java index ff4d6de..134f972 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -23,7 +24,7 @@ public class JaroWinklerTitle extends SecondStringDistanceAlgo { } @Override - public double distance(String a, String b) { + public double distance(String a, String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java index 2d05a00..7b002ae 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import 
eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -22,7 +23,7 @@ public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { final String ca = cleanup(a); final String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java index 503dc33..f43d319 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.DistanceScorer; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -27,7 +28,7 @@ public class LevensteinTitle extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { final String ca = cleanup(a); final String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java index ff8b34b..956538e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import 
eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -25,7 +26,7 @@ public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java index e794f02..a1f555f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -22,7 +23,7 @@ public class MustBeDifferent extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { return !a.equals(b) ? 
1.0 : 0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java index 8afc45f..16dca1c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; @@ -17,7 +18,7 @@ public class NullDistanceAlgo implements DistanceAlgo { } @Override - public double distance(Field a, Field b) { + public double distance(Field a, Field b, final Config conf) { return 0.0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java index 8f0c024..e2c0007 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import org.apache.commons.lang.StringUtils; @@ -69,9 +70,9 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) */ @Override - public double distance(final Field a, final Field b) { + public double distance(final Field a, final Field b, final Config conf) { if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) - return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit)); + return 
distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf); throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java index eacfdc0..5fd05fe 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -28,7 +29,7 @@ public class UrlMatcher extends Levenstein { } @Override - public double distance(Field a, Field b) { + public double distance(Field a, Field b, final Config conf) { final URL urlA = asUrl(getFirstValue(a)); final URL urlB = asUrl(getFirstValue(b)); @@ -44,7 +45,7 @@ public class UrlMatcher extends Levenstein { return hostW * 0.5; } - return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath()); + return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf); } private URL asUrl(final String value) { diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 84ec090..a718941 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -1,22 +1,25 @@ package eu.dnetlib.pace.clustering; -import java.util.Map; - import com.google.common.collect.Lists; import com.google.common.collect.Maps; import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.common.AbstractPaceFunctions; -import 
eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.distance.DistanceAlgoTest; import org.junit.Before; import org.junit.Test; +import java.util.Map; + public class ClusteringFunctionTest extends AbstractPaceTest { private Map params; + DedupConfig conf; @Before public void setUp() throws Exception { params = Maps.newHashMap(); + conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class)); } @Test @@ -26,7 +29,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "http://www.test.it/path/to/resource"; System.out.println(s); - System.out.println(urlClustering.apply(Lists.newArrayList(url(s)))); + System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s)))); } @Test @@ -40,7 +43,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(ngram.apply(Lists.newArrayList(title(s)))); + System.out.println(ngram.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -52,7 +55,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(np.apply(Lists.newArrayList(title(s)))); + System.out.println(np.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -64,11 +67,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s1 = "University of Pisa"; System.out.println(s1); - System.out.println(np.apply(Lists.newArrayList(title(s1)))); + System.out.println(np.apply(conf, Lists.newArrayList(title(s1)))); final String s2 = "Pisa University"; System.out.println(s2); - System.out.println(np.apply(Lists.newArrayList(title(s2)))); + System.out.println(np.apply(conf, Lists.newArrayList(title(s2)))); } @Test @@ -81,7 +84,7 @@ public class ClusteringFunctionTest extends 
AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(acro.apply(Lists.newArrayList(title(s)))); + System.out.println(acro.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -93,7 +96,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(sp.apply(Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -105,7 +108,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(sp.apply(Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -114,7 +117,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = readFromClasspath("gt.author.json"); System.out.println(s); - System.out.println(cf.apply(Lists.newArrayList(person(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(person(s)))); } @Test @@ -123,27 +126,27 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final ClusteringFunction cf = new KeywordsClustering(params); final String s = "Polytechnic University of Turin"; System.out.println(s); - System.out.println(cf.apply(Lists.newArrayList(title(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); final String s1 = "POLITECNICO DI TORINO"; System.out.println(s1); - System.out.println(cf.apply(Lists.newArrayList(title(s1)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); final String s2 = "Universita farmaceutica culturale di milano bergamo"; System.out.println("s2 = " + s2); - System.out.println(cf.apply(Lists.newArrayList(title(s2)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s2)))); final String s3 = "universita universita milano 
milano"; System.out.println("s3 = " + s3); - System.out.println(cf.apply(Lists.newArrayList(title(s3)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s3)))); final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)"; System.out.println("s4 = " + s4); - System.out.println(cf.apply(Lists.newArrayList(title(s4)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s4)))); final String s5 = "İstanbul Ticarət Universiteti"; System.out.println("s5 = " + s5); - System.out.println(cf.apply(Lists.newArrayList(title(s5)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s5)))); } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 3bf300b..97773f1 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance; import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName; import org.junit.Before; import org.junit.Test; @@ -17,13 +18,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; private Map params; + private DedupConfig conf; @Before public void setup() { - System.out.println("****************************************************************"); - System.out.println("Test String : " + TEST_STRING); params = new HashMap<>(); params.put("weight", 1.0); + conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class)); } @Test @@ -55,7 +56,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { @Test public void testJaroWinklerNormalizedName() { final JaroWinklerNormalizedName 
jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State"); + double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf); System.out.println("result = " + result); assertEquals(0.0, result); @@ -65,7 +66,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName2() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York"); + double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf); assertEquals(1.0, result); } @@ -74,7 +75,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName3() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna"); + double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf); System.out.println("result = " + result); assertEquals(0.0, result); @@ -84,7 +85,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName4() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa"); + double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf); System.out.println("result = " + result); assertEquals(1.0, result); @@ -94,7 +95,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions 
{ public void testJaroWinklerNormalizedName5() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS"); + double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf); System.out.println("result = " + result); assertEquals(1.0, result); @@ -104,7 +105,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName6() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung"); + double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf); System.out.println("result = " + result); assertTrue(result > 0.9); @@ -115,7 +116,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName7() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO"); + double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf); System.out.println("result = " + result); assertTrue(result > 0.9); @@ -125,7 +126,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName8() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of 
Technology)", "Warsaw University of Technology"); + double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf); System.out.println("result = " + result); } @@ -134,7 +135,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName9() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti"); + double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf); System.out.println("result = " + result); } @@ -144,7 +145,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence"); + double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf); System.out.println("result = " + result); } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf index fd4fbbe..0293680 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf @@ -5,7 +5,7 @@ "entityType" : "organization", "orderField" : "legalname", "queueMaxSize" : "2000", - "groupMaxSize" : "10", + "groupMaxSize" : "50", "slidingWindowSize" : "200", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true" @@ -14,23 +14,131 @@ "clustering" : [ { "name" : "sortedngrampairs", "fields" : [ "legalname" ], 
"params" : { "max" : 2, "ngramLen" : "3"} }, { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, - { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } ], "strictConditions" : [ { "name" : "exactMatch", "fields" : [ "gridid" ] } ], "conditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] }, - { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, + { "name" : "exactMatch", "fields" : [ "country" ] } ], "model" : [ - { "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, - { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }, + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, - { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : 
"organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } ], - "blacklists" : { } + "blacklists" : { + "legalname" : [] + }, + "synonyms": { + "key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": 
["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": 
["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": 
["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50": 
["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"], + "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"], + "key::57": 
["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"], + "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"], + "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"], + "key::66": 
["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"], + "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"], + "key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline"], + "key::76": 
["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"], + "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"], + "key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"], + "key::85": 
["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"], + "key::95": 
["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"], + "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"], + "key::102": ["informatics","informatica","informática","informática","informatica"], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] + } } } \ No newline at end of file From 7998f37ce180fd172419070f2d8f768ed748fa04 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 8 Oct 2019 15:13:45 +0200 Subject: [PATCH 
06/13] normalization of the term in the translation map added --- .../main/java/eu/dnetlib/pace/config/PaceConfig.java | 5 ++++- .../test/java/eu/dnetlib/pace/config/ConfigTest.java | 10 +++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 490fbaf..939f6c0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -11,6 +11,7 @@ import org.apache.commons.collections.CollectionUtils; import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; +import java.text.Normalizer; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -46,7 +47,9 @@ public class PaceConfig implements Serializable { translationMap = Maps.newHashMap(); for (String key : synonyms.keySet()) { for (String term : synonyms.get(key)){ - translationMap.put(term.toLowerCase(), key); + translationMap.put( + Normalizer.normalize(term.toLowerCase(), Normalizer.Form.NFD), + key); } } } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 575b3c7..fd16f73 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -10,7 +10,7 @@ public class ConfigTest extends AbstractPaceTest { @Test public void dedupConfigSerializationTest() { - final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json")); + final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("org.curr.conf")); final String conf = cfgFromClasspath.toString(); @@ -37,4 +37,12 @@ public class ConfigTest extends AbstractPaceTest { System.out.println(load.toString()); } + @Test + public void 
translationMapTest() { + + DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf")); + + System.out.println("translationMap = " + load.getPace().translationMap().toString()); + } + } From 1cbb48f77b6c8e8fed08523a21fceec6a4de4fff Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 8 Oct 2019 16:49:07 +0200 Subject: [PATCH 07/13] minor changes --- .../dnetlib/pace/clustering/KeywordsClustering.java | 2 +- .../dnetlib/pace/common/AbstractPaceFunctions.java | 12 ++++-------- .../main/java/eu/dnetlib/pace/config/PaceConfig.java | 1 + .../distance/algo/JaroWinklerNormalizedName.java | 2 +- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index 1680ab0..769ecf5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -25,7 +25,7 @@ public class KeywordsClustering extends AbstractClusteringFunction { //list of combination to return as result final Collection combinations = new LinkedHashSet(); - for (String keyword: keywordsToCodes(keywords)){ + for (String keyword: keywordsToCodes(keywords, conf.translationMap())){ for (String city: citiesToCodes(cities)) { combinations.add(keyword+"-"+city); if (combinations.size()>=params.getOrDefault("max", 2)) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 3050293..23ff7ac 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -17,10 +17,7 @@ import java.io.IOException; import java.io.StringWriter; import java.text.Normalizer; import java.util.*; -import 
java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; -import java.util.stream.Stream; /** * Set of common functions @@ -30,7 +27,6 @@ import java.util.stream.Stream; */ public abstract class AbstractPaceFunctions { - private static Map translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv"); private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); @@ -243,10 +239,10 @@ public abstract class AbstractPaceFunctions { } - public double keywordsCompare(Set s1, Set s2){ + public double keywordsCompare(Set s1, Set s2, Map translationMap){ - Set k1 = keywordsToCodes(s1); - Set k2 = keywordsToCodes(s2); + Set k1 = keywordsToCodes(s1, translationMap); + Set k2 = keywordsToCodes(s2, translationMap); int longer = (k1.size()>k2.size())?k1.size():k2.size(); @@ -278,7 +274,7 @@ public abstract class AbstractPaceFunctions { return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); } - public Set keywordsToCodes(Set keywords) { + public Set keywordsToCodes(Set keywords, Map translationMap) { return toCodes(keywords, translationMap); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 939f6c0..56995bb 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.config; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.CondDef; diff --git 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java index 64ab74c..889ebab 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java @@ -46,7 +46,7 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { if (sameCity(cities1,cities2)) { - if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) { + if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) { ca = removeKeywords(ca, keywords1); ca = removeKeywords(ca, cities1); From 2ffaa235a2a29dd8c4843c21aa94af38155c7eb5 Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 23 Oct 2019 16:31:45 +0200 Subject: [PATCH 08/13] minor changes and configuration updates (synonym field added) --- dnet-dedup-test/pom.xml | 2 +- .../eu/dnetlib/pace/result.simple.pace.conf | 3 +- .../java/eu/dnetlib/pace/DedupTestIT.java | 2 + .../eu/dnetlib/pace/organization.test.conf | 131 +++++++++++++++- .../eu/dnetlib/pace/result.authors.pace.conf | 3 +- .../eu/dnetlib/pace/result.full.pace.conf | 4 +- .../eu/dnetlib/pace/result.pace.conf | 3 +- .../eu/dnetlib/pace/result.simple.pace.conf | 3 +- dnet-pace-core/pom.xml | 2 +- .../eu/dnetlib/pace/config/ConfigTest.java | 9 ++ .../eu/dnetlib/pace/config/org.test.conf | 144 ++++++++++++++++++ pom.xml | 4 +- release.properties | 27 +++- 13 files changed, 313 insertions(+), 24 deletions(-) create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index b2b0437..9fb70b0 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15-SNAPSHOT + 3.0.15 ../pom.xml diff --git 
a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf index 3f13651..6bf8785 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf @@ -15,7 +15,8 @@ "model" : [ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" } ], - "blacklists" : { } + "blacklists" : { }, + "synonyms" : { } } } \ No newline at end of file diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java index 702b4ab..a23d6dd 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace; import org.apache.oozie.client.OozieClient; import org.apache.oozie.client.OozieClientException; import org.apache.oozie.client.WorkflowJob; +import org.junit.Ignore; import org.junit.Test; import java.io.IOException; @@ -12,6 +13,7 @@ import static junit.framework.Assert.assertEquals; public class DedupTestIT { + @Ignore @Test public void deduplicationTest() throws OozieClientException, InterruptedException { diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf index 3137253..0293680 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf @@ -1,27 +1,144 @@ { "wf" : { - "threshold" : "0.98", + "threshold" : "0.9", "dedupRun" : "001", "entityType" : "organization", "orderField" : "legalname", "queueMaxSize" : "2000", - "groupMaxSize" : "10", + "groupMaxSize" : 
"50", "slidingWindowSize" : "200", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true" }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } } + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "gridid" ] } ], "conditions" : [ + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, { "name" : "exactMatch", "fields" : [ "country" ] } ], "model" : [ - { "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, - { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" } + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, + { "name" : "websiteurl", "algo" : "Null", "type" : 
"URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } ], - "blacklists" : { } + "blacklists" : { + "legalname" : [] + }, + "synonyms": { + "key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": 
["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": 
["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": 
["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50": 
["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"], + "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"], + "key::57": 
["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"], + "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"], + "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"], + "key::66": 
["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"], + "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"], + "key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"], + "key::76": 
["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"], + "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"], + "key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"], + "key::85": 
["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"], + "key::95": 
["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"], + "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"], + "key::102": ["informatics","informatica","informática","informática","informatica"], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] + } } } \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf 
b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf index a7c837c..6716770 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf @@ -19,7 +19,8 @@ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, { "name" : "authors", "algo" : "SortedLevel2JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" } ], - "blacklists" : { } + "blacklists" : { }, + "synonyms" : { } } } diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf index e9b384d..da67c04 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf @@ -45,7 +45,9 @@ "^(WHP Cruise Summary Information of section).*$", "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", "^(Measurement of the spin\\-dependent structure function).*" - ] } + ] }, + "synonyms" : { + } } } diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf index 2225bce..3e361b6 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf @@ -23,7 +23,8 @@ { "name" : "title", "algo" : "Level2Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : 
"result/metadata/dateofacceptance/value" } ], - "blacklists" : { } + "blacklists" : { }, + "synonyms" : { } } } diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf index 5fa9b84..b438ab9 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf @@ -15,7 +15,8 @@ "model" : [ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" } ], - "blacklists" : { } + "blacklists" : { } , + "synonyms" : { } } } \ No newline at end of file diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 34138cc..4f8aa1d 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15-SNAPSHOT + 3.0.15 ../pom.xml diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index fd16f73..9051049 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -5,6 +5,7 @@ import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; public class ConfigTest extends AbstractPaceTest { @@ -45,4 +46,12 @@ public class ConfigTest extends AbstractPaceTest { System.out.println("translationMap = " + load.getPace().translationMap().toString()); } + @Test + public void emptyTranslationMapTest() { + + DedupConfig load = DedupConfig.load(readFromClasspath("org.test.conf")); + + assertEquals(0, load.getPace().translationMap().keySet().size()); + } + } diff --git 
a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf new file mode 100644 index 0000000..0293680 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf @@ -0,0 +1,144 @@ +{ + "wf" : { + "threshold" : "0.9", + "dedupRun" : "001", + "entityType" : "organization", + "orderField" : "legalname", + "queueMaxSize" : "2000", + "groupMaxSize" : "50", + "slidingWindowSize" : "200", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "gridid" ] } + ], + "conditions" : [ + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, + { "name" : "exactMatch", "fields" : [ "country" ] } + ], + "model" : [ + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, + { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", 
"ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } + ], + "blacklists" : { + "legalname" : [] + }, + "synonyms": { + "key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": 
["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": 
["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": 
["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50": 
["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"], + "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"], + "key::57": 
["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"], + "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"], + "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"], + "key::66": 
["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"], + "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"], + "key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"], + "key::76": 
["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"], + "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"], + "key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"], + "key::85": 
["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"], + "key::95": 
["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"], + "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"], + "key::102": ["informatics","informatica","informática","informática","informatica"], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] + } + } +} \ No newline at end of file diff --git a/pom.xml b/pom.xml index 4e9d3fe..9442f1f 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 
3.0.15-SNAPSHOT + 3.0.15 pom @@ -22,7 +22,7 @@ scm:git:https://github.com/dnet-team/dnet-dedup.git - HEAD + dnet-dedup-3.0.15 diff --git a/release.properties b/release.properties index 5c101a5..9028856 100644 --- a/release.properties +++ b/release.properties @@ -1,11 +1,22 @@ #release configuration -#Mon Jul 08 10:03:15 CEST 2019 -scm.tagNameFormat=@{project.artifactId}-@{project.version} -pushChanges=true -scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git -preparationGoals=clean verify -projectVersionPolicyId=default -remoteTagging=true +#Wed Oct 23 16:22:47 CEST 2019 scm.commentPrefix=[maven-release-plugin] +pushChanges=true +project.rel.eu.dnetlib\:dnet-dedup-test=3.0.15 +scm.tag=dnet-dedup-3.0.15 +remoteTagging=true +project.scm.eu.dnetlib\:dnet-dedup-test.empty=true +projectVersionPolicyId=default +scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git +scm.tagNameFormat=@{project.artifactId}-@{project.version} +project.rel.eu.dnetlib\:dnet-dedup=3.0.15 +project.dev.eu.dnetlib\:dnet-pace-core=3.0.16-SNAPSHOT +preparationGoals=clean verify +project.scm.eu.dnetlib\:dnet-dedup.tag=HEAD +project.scm.eu.dnetlib\:dnet-dedup.developerConnection=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git exec.snapshotReleasePluginAllowed=false -completedPhase=create-backup-poms +project.dev.eu.dnetlib\:dnet-dedup=3.0.16-SNAPSHOT +project.scm.eu.dnetlib\:dnet-pace-core.empty=true +project.dev.eu.dnetlib\:dnet-dedup-test=3.0.16-SNAPSHOT +completedPhase=generate-release-poms +project.rel.eu.dnetlib\:dnet-pace-core=3.0.15 From 4874038f8ecdc367973638650c0dda4f1307a60b Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 23 Oct 2019 16:37:20 +0200 Subject: [PATCH 09/13] minor changes --- dnet-dedup-test/dependency-reduced-pom.xml | 2 +- .../eu/dnetlib/pace/config/org.test.conf | 104 ------------------ release.properties | 4 +- 3 files changed, 3 insertions(+), 107 deletions(-) diff --git a/dnet-dedup-test/dependency-reduced-pom.xml 
b/dnet-dedup-test/dependency-reduced-pom.xml index 07b9268..a424d01 100644 --- a/dnet-dedup-test/dependency-reduced-pom.xml +++ b/dnet-dedup-test/dependency-reduced-pom.xml @@ -3,7 +3,7 @@ dnet-dedup eu.dnetlib - 3.0.14-SNAPSHOT + 3.0.15 4.0.0 dnet-dedup-test diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf index 0293680..3af6585 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf @@ -35,110 +35,6 @@ "legalname" : [] }, "synonyms": { - "key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], - "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], - "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], - "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], - "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], - "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], - "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], - "key::8": 
["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], - "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], - "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], - "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], - "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], - "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], - "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], - "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], - "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], - "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], - "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], - "key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"], - "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], - "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], - "key::22": 
["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], - "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], - "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], - "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], - "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], - "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], - "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], - "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], - "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], - "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], - "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], - "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], - "key::34": 
["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], - "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], - "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], - "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], - "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], - "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], - "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], - "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], - "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], - "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], - "key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], - "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], - "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], - "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], - "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], - "key::49": 
["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], - "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], - "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"], - "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"], - "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"], - "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"], - "key::55": 
["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"], - "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"], - "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"], - "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"], - "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"], - "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"], - "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"], - "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"], - "key::63": 
["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"], - "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"], - "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"], - "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"], - "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"], - "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"], - "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"], - "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"], - "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"], - "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"], - "key::73": 
["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"], - "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"], - "key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"], - "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"], - "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"], - "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"], - "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"], - "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"], - "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"], - "key::82": 
["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"], - "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"], - "key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"], - "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"], - "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"], - "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"], - "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"], - "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"], - "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"], - "key::91": 
["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"], - "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"], - "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"], - "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"], - "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"], - "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"], - "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"], - "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"], - "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"], - "key::100": 
["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"], - "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"], - "key::102": ["informatics","informatica","informática","informática","informatica"], - "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], - "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] } } } \ No newline at end of file diff --git a/release.properties b/release.properties index 9028856..907a9fc 100644 --- a/release.properties +++ b/release.properties @@ -1,5 +1,5 @@ #release configuration -#Wed Oct 23 16:22:47 CEST 2019 +#Wed Oct 23 16:34:27 CEST 2019 scm.commentPrefix=[maven-release-plugin] pushChanges=true project.rel.eu.dnetlib\:dnet-dedup-test=3.0.15 @@ -18,5 +18,5 @@ exec.snapshotReleasePluginAllowed=false project.dev.eu.dnetlib\:dnet-dedup=3.0.16-SNAPSHOT project.scm.eu.dnetlib\:dnet-pace-core.empty=true project.dev.eu.dnetlib\:dnet-dedup-test=3.0.16-SNAPSHOT -completedPhase=generate-release-poms +completedPhase=run-preparation-goals project.rel.eu.dnetlib\:dnet-pace-core=3.0.15 From 4712fef82f702d912e7374be6934664ffef1bab3 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 24 Oct 2019 11:11:07 +0200 Subject: [PATCH 10/13] release rollback --- dnet-dedup-test/dependency-reduced-pom.xml | 2 +- dnet-dedup-test/pom.xml | 2 +- dnet-pace-core/pom.xml | 2 +- pom.xml | 4 ++-- release.properties | 22 ---------------------- 5 files changed, 5 insertions(+), 27 deletions(-) delete mode 100644 release.properties diff --git a/dnet-dedup-test/dependency-reduced-pom.xml b/dnet-dedup-test/dependency-reduced-pom.xml index a424d01..07b9268 100644 --- 
a/dnet-dedup-test/dependency-reduced-pom.xml +++ b/dnet-dedup-test/dependency-reduced-pom.xml @@ -3,7 +3,7 @@ dnet-dedup eu.dnetlib - 3.0.15 + 3.0.14-SNAPSHOT 4.0.0 dnet-dedup-test diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 9fb70b0..b2b0437 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15 + 3.0.15-SNAPSHOT ../pom.xml diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 4f8aa1d..34138cc 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15 + 3.0.15-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 9442f1f..4e9d3fe 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.15 + 3.0.15-SNAPSHOT pom @@ -22,7 +22,7 @@ scm:git:https://github.com/dnet-team/dnet-dedup.git - dnet-dedup-3.0.15 + HEAD diff --git a/release.properties b/release.properties deleted file mode 100644 index 907a9fc..0000000 --- a/release.properties +++ /dev/null @@ -1,22 +0,0 @@ -#release configuration -#Wed Oct 23 16:34:27 CEST 2019 -scm.commentPrefix=[maven-release-plugin] -pushChanges=true -project.rel.eu.dnetlib\:dnet-dedup-test=3.0.15 -scm.tag=dnet-dedup-3.0.15 -remoteTagging=true -project.scm.eu.dnetlib\:dnet-dedup-test.empty=true -projectVersionPolicyId=default -scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git -scm.tagNameFormat=@{project.artifactId}-@{project.version} -project.rel.eu.dnetlib\:dnet-dedup=3.0.15 -project.dev.eu.dnetlib\:dnet-pace-core=3.0.16-SNAPSHOT -preparationGoals=clean verify -project.scm.eu.dnetlib\:dnet-dedup.tag=HEAD -project.scm.eu.dnetlib\:dnet-dedup.developerConnection=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git -exec.snapshotReleasePluginAllowed=false -project.dev.eu.dnetlib\:dnet-dedup=3.0.16-SNAPSHOT -project.scm.eu.dnetlib\:dnet-pace-core.empty=true -project.dev.eu.dnetlib\:dnet-dedup-test=3.0.16-SNAPSHOT -completedPhase=run-preparation-goals 
-project.rel.eu.dnetlib\:dnet-pace-core=3.0.15 From 452ab7892dfaea474acbc034c4a7f7f78c97af3e Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 24 Oct 2019 11:17:07 +0200 Subject: [PATCH 11/13] [maven-release-plugin] prepare release dnet-dedup-3.0.15 --- dnet-dedup-test/pom.xml | 2 +- dnet-pace-core/pom.xml | 2 +- pom.xml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index b2b0437..9fb70b0 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15-SNAPSHOT + 3.0.15 ../pom.xml diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 34138cc..4f8aa1d 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15-SNAPSHOT + 3.0.15 ../pom.xml diff --git a/pom.xml b/pom.xml index 4e9d3fe..9442f1f 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.15-SNAPSHOT + 3.0.15 pom @@ -22,7 +22,7 @@ scm:git:https://github.com/dnet-team/dnet-dedup.git - HEAD + dnet-dedup-3.0.15 From 58f128d8613d420b3b824d1807b9213cb01f0b69 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 24 Oct 2019 11:23:01 +0200 Subject: [PATCH 12/13] Revert "[maven-release-plugin] prepare release dnet-dedup-3.0.15" This reverts commit 452ab7892dfaea474acbc034c4a7f7f78c97af3e. 
--- dnet-dedup-test/pom.xml | 2 +- dnet-pace-core/pom.xml | 2 +- pom.xml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 9fb70b0..b2b0437 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15 + 3.0.15-SNAPSHOT ../pom.xml diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 4f8aa1d..34138cc 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15 + 3.0.15-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 9442f1f..4e9d3fe 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.15 + 3.0.15-SNAPSHOT pom @@ -22,7 +22,7 @@ scm:git:https://github.com/dnet-team/dnet-dedup.git - dnet-dedup-3.0.15 + HEAD From 8dba7a04f8426faabf70b7db349761fa30095461 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 24 Oct 2019 11:28:20 +0200 Subject: [PATCH 13/13] dependency-reduced-pom deleted --- dnet-dedup-test/dependency-reduced-pom.xml | 119 --------------------- 1 file changed, 119 deletions(-) delete mode 100644 dnet-dedup-test/dependency-reduced-pom.xml diff --git a/dnet-dedup-test/dependency-reduced-pom.xml b/dnet-dedup-test/dependency-reduced-pom.xml deleted file mode 100644 index 07b9268..0000000 --- a/dnet-dedup-test/dependency-reduced-pom.xml +++ /dev/null @@ -1,119 +0,0 @@ - - - - dnet-dedup - eu.dnetlib - 3.0.14-SNAPSHOT - - 4.0.0 - dnet-dedup-test - - - - maven-shade-plugin - 2.4.3 - - - package - - shade - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - - - maven-deploy-plugin - 2.7 - - true - - - - maven-compiler-plugin - - 1.8 - 1.8 - - **/*.java - - - - - net.alchim31.maven - scala-maven-plugin - 4.0.1 - - - scala-compile-first - initialize - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - ${scala.version} - - - - - - - junit - junit - 4.9 - test - - - 
hamcrest-core - org.hamcrest - - - - - org.apache.oozie - oozie-client - 5.1.0 - test - - - json-simple - com.googlecode.json-simple - - - jms - javax.jms - - - slf4j-simple - org.slf4j - - - oozie-fluent-job-api - org.apache.oozie - - - - - -