diff --git a/dnet-dedup-test/dependency-reduced-pom.xml b/dnet-dedup-test/dependency-reduced-pom.xml
deleted file mode 100644
index 07b9268..0000000
--- a/dnet-dedup-test/dependency-reduced-pom.xml
+++ /dev/null
@@ -1,119 +0,0 @@
-
-
-
- dnet-dedup
- eu.dnetlib
- 3.0.14-SNAPSHOT
-
- 4.0.0
- dnet-dedup-test
-
-
-
- maven-shade-plugin
- 2.4.3
-
-
- package
-
- shade
-
-
-
-
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
-
-
-
-
-
-
-
-
- maven-deploy-plugin
- 2.7
-
- true
-
-
-
- maven-compiler-plugin
-
-
- 1.8
-
- **/*.java
-
-
-
-
- net.alchim31.maven
- scala-maven-plugin
- 4.0.1
-
-
- scala-compile-first
- initialize
-
- add-source
- compile
-
-
-
- scala-test-compile
- process-test-resources
-
- testCompile
-
-
-
-
- ${scala.version}
-
-
-
-
-
-
- junit
- junit
- 4.9
- test
-
-
- hamcrest-core
- org.hamcrest
-
-
-
-
- org.apache.oozie
- oozie-client
- 5.1.0
- test
-
-
- json-simple
- com.googlecode.json-simple
-
-
- jms
- javax.jms
-
-
- slf4j-simple
- org.slf4j
-
-
- oozie-fluent-job-api
- org.apache.oozie
-
-
-
-
-
-
diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml
index e5d429b..b2b0437 100644
--- a/dnet-dedup-test/pom.xml
+++ b/dnet-dedup-test/pom.xml
@@ -6,7 +6,7 @@
eu.dnetlib
dnet-dedup
- 3.0.14-SNAPSHOT
+ 3.0.15-SNAPSHOT
../pom.xml
diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java
index a0f4e48..0fcea0a 100644
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java
@@ -88,11 +88,11 @@ public class SparkLocalTest {
connectedComponents.foreach(cc -> {
System.out.println(cc);
});
- connectedComponents.foreach(cc -> {
- cc.getDocs().stream().forEach(d -> {
- System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname"));
- });
- });
+// connectedComponents.foreach(cc -> {
+// cc.getDocs().stream().forEach(d -> {
+// System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname"));
+// });
+// });
//print nondeduped
nonDeduplicated.foreach(cc -> {
System.out.println(cc);
diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf
index dd4c4bf..195cd98 100644
--- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf
+++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf
@@ -31,6 +31,112 @@
],
"blacklists" : {
"legalname" : []
+ },
+ "synonyms": {
+ "key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
+ "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
+ "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
+ "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
+ "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
+ "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
+ "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
+ "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
+ "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
+ "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
+ "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
+ "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
+ "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
+ "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
+ "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
+ "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
+ "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
+ "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
+ "key::19": ["museum","museo","musée","museo","museu","museum","muzeum","музей","museum","μουσείο"],
+ "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
+ "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
+ "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
+ "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
+ "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
+ "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
+ "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
+ "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
+ "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
+ "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
+ "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
+ "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
+ "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
+ "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
+ "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
+ "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
+ "key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
+ "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
+ "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
+ "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
+ "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
+ "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
+ "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
+ "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
+ "key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
+ "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
+ "key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
+ "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
+ "key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
+ "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
+ "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
+ "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
+ "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
+ "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
+ "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
+ "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
+ "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
+ "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
+ "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
+ "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
+ "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
+ "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
+ "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
+ "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
+ "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
+ "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
+ "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
+ "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
+ "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
+ "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
+ "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärztlich","veterinair","veeartsenijkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
+ "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
+ "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
+ "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
+ "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
+ "key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
+ "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
+ "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
+ "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
+ "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
+ "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
+ "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
+ "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
+ "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
+ "key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
+ "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
+ "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
+ "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
+ "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
+ "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
+ "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
+ "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
+ "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
+ "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
+ "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
+ "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
+ "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
+ "key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszichológia","psihologija","psühholoogia",""],
+ "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
+ "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
+ "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
+ "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
+ "key::102": ["informatics","informatica","informática","informática","informatica",""],
+ "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
+ "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
}
}
}
\ No newline at end of file
diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json
index 1e335ed..948fcf9 100644
--- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json
+++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIOK"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6183d331a1920dd81b8c10620a8b3a8a"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIVEL"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::8f65fd4e764086db897cc648e9cbbaed"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Social Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::c69cffc4997b54bb2eb5ca6aebcda18b"}
@@ -9,4 +10,9 @@
{"dateoftransformation":"2018-11-20","originalId":["corda_______::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::755737ed505484ea374062762ef05ef6"}
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::755737ed505484ea374062762ef05ef6"}
{"dateoftransformation":"2018-09-13","originalId":["snsf________::The_Netherlands_Institute_of_Health_Services_Research_NIVEL"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute of Health Services Research NIVEL"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-22","type":20,"id":"20|snsf________::10653be4e9c170181486aa9782346d81"}
-{"dateoftransformation":"2018-09-13","originalId":["openaire____::088a0087-4bc6-4c38-a052-b446c3b225a7::The Netherlands Institute for Social Research"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute for Social Research"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-03-30","type":20,"id":"20|openaire____::857b30f258c43852a2cb57875ac40892"}
\ No newline at end of file
+{"dateoftransformation":"2018-09-13","originalId":["openaire____::088a0087-4bc6-4c38-a052-b446c3b225a7::The Netherlands Institute for Social Research"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute for Social Research"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-03-30","type":20,"id":"20|openaire____::857b30f258c43852a2cb57875ac40892"}
+=======
+{"dateoftransformation":"2018-11-20","originalId":["corda_______::999987066"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NATIONAAL LUCHT- EN RUIMTEVAARTLABORATORIUM"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::9cb56cf06fbe3926d0c88ee320908848"}
+{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999987066"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NATIONAAL LUCHT- EN RUIMTEVAARTLABORATORIUM"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::9cb56cf06fbe3926d0c88ee320908848"}
+{"dateoftransformation":"2018-11-12","originalId":["opendoar____::Netherlands_Aerospace_Centre"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Netherlands Aerospace Centre"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-11-12","type":20,"id":"20|opendoar____::ce12359dec61a8e00837c3e507918812"}
+>>>>>>> origin/master
diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf
index 517abb0..56a39de 100644
--- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf
+++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf
@@ -15,7 +15,8 @@
"model" : [
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }
],
- "blacklists" : { }
+ "blacklists" : { },
+ "synonyms" : { }
}
}
\ No newline at end of file
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java
index 702b4ab..a23d6dd 100644
--- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java
@@ -3,6 +3,7 @@ package eu.dnetlib.pace;
import org.apache.oozie.client.OozieClient;
import org.apache.oozie.client.OozieClientException;
import org.apache.oozie.client.WorkflowJob;
+import org.junit.Ignore;
import org.junit.Test;
import java.io.IOException;
@@ -12,6 +13,7 @@ import static junit.framework.Assert.assertEquals;
public class DedupTestIT {
+ @Ignore
@Test
public void deduplicationTest() throws OozieClientException, InterruptedException {
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf
index be3a9bf..cee2fa1 100644
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf
@@ -1,27 +1,144 @@
{
"wf" : {
- "threshold" : "0.98",
+ "threshold" : "0.9",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
- "groupMaxSize" : "10",
+ "groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
- { "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
- { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }
- ],
- "necessaryConditions" : [
- { "name" : "exactMatch", "fieldsCount" : [ "country" ] }
+ { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
+ { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
+ { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
+ { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
+ ],
+ "strictConditions" : [
+ { "name" : "exactMatch", "fields" : [ "gridid" ] }
+ ],
+ "conditions" : [
+ { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
+ { "name" : "exactMatch", "fields" : [ "country" ] }
],
"model" : [
- { "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
- { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }
+ { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
+ { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
+ { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
+ { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
+ { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],
- "blacklists" : { }
+ "blacklists" : {
+ "legalname" : []
+ },
+ "synonyms": {
+ "key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
+ "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
+ "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
+ "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
+ "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
+ "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
+ "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
+ "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
+ "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
+ "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
+ "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
+ "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
+ "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
+ "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
+ "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
+ "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
+ "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
+ "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
+ "key::19": ["museum","museo","musée","museo","museu","museum","muzeum","музей","museum","μουσείο"],
+ "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
+ "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
+ "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
+ "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
+ "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
+ "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
+ "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
+ "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
+ "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
+ "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
+ "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
+ "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
+ "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
+ "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
+ "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
+ "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
+ "key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
+ "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
+ "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
+ "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
+ "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
+ "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
+ "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
+ "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
+ "key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
+ "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
+ "key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
+ "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
+ "key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
+ "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
+ "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
+ "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"],
+ "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"],
+ "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"],
+ "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"],
+ "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"],
+ "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"],
+ "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"],
+ "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"],
+ "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"],
+ "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"],
+ "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"],
+ "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"],
+ "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"],
+ "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"],
+ "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"],
+ "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"],
+ "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"],
+ "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"],
+ "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"],
+ "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"],
+ "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"],
+ "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"],
+ "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"],
+ "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"],
+ "key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline"],
+ "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"],
+ "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"],
+ "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"],
+ "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"],
+ "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"],
+ "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"],
+ "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"],
+ "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"],
+ "key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"],
+ "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"],
+ "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"],
+ "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"],
+ "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"],
+ "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"],
+ "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"],
+ "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"],
+ "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"],
+ "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"],
+ "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"],
+ "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"],
+ "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"],
+ "key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"],
+ "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"],
+ "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"],
+ "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"],
+ "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"],
+ "key::102": ["informatics","informatica","informática","informática","informatica"],
+ "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
+ "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
+ }
}
}
\ No newline at end of file
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf
index 5b67978..af5d545 100644
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf
@@ -19,7 +19,8 @@
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "authors", "algo" : "SortedLevel2JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
],
- "blacklists" : { }
+ "blacklists" : { },
+ "synonyms" : { }
}
}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf
index cb70c63..2f61ae6 100644
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf
@@ -45,7 +45,9 @@
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*"
- ] }
+ ] },
+ "synonyms" : {
+ }
}
}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf
index 992d57e..1111a19 100644
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf
@@ -23,7 +23,8 @@
{ "name" : "title", "algo" : "Level2Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" }
],
- "blacklists" : { }
+ "blacklists" : { },
+ "synonyms" : { }
}
}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf
index b3284ce..72ca533 100644
--- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf
@@ -15,7 +15,8 @@
"model" : [
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }
],
- "blacklists" : { }
+ "blacklists" : { },
+ "synonyms" : { }
}
}
\ No newline at end of file
diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml
index 9499750..34138cc 100644
--- a/dnet-pace-core/pom.xml
+++ b/dnet-pace-core/pom.xml
@@ -6,7 +6,7 @@
eu.dnetlib
dnet-dedup
- 3.0.14-SNAPSHOT
+ 3.0.15-SNAPSHOT
../pom.xml
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
index 1782b87..7fdcce4 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
@@ -1,6 +1,7 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils;
@@ -18,15 +19,15 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
this.params = params;
}
- protected abstract Collection doApply(String s);
+ protected abstract Collection doApply(Config conf, String s);
@Override
- public Collection apply(List fields) {
+ public Collection apply(Config conf, List fields) {
return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue)
.map(this::normalize)
.map(s -> filterAllStopWords(s))
- .map(this::doApply)
+ .map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank)
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
index ee5efc9..d300833 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
@@ -6,6 +6,7 @@ import java.util.Set;
import java.util.StringTokenizer;
import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
@ClusteringClass("acronyms")
public class Acronyms extends AbstractClusteringFunction {
@@ -15,7 +16,7 @@ public class Acronyms extends AbstractClusteringFunction {
}
@Override
- protected Collection doApply(String s) {
+ protected Collection doApply(Config conf, String s) {
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
index a4b58aa..52859b4 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
@@ -13,15 +13,15 @@ import eu.dnetlib.pace.model.Field;
public class ClusteringCombiner {
public static Collection combine(final Document a, final Config conf) {
- return new ClusteringCombiner().doCombine(a, conf.clusterings());
+ return new ClusteringCombiner().doCombine(a, conf);
}
- private Collection doCombine(final Document a, final List defs) {
+ private Collection doCombine(final Document a, final Config conf) {
final Collection res = Sets.newLinkedHashSet();
- for (final ClusteringDef cd : defs) {
+ for (final ClusteringDef cd : conf.clusterings()) {
for (final String fieldName : cd.getFields()) {
final Field values = a.values(fieldName);
- res.addAll(cd.clusteringFunction().apply((List) values));
+ res.addAll(cd.clusteringFunction().apply(conf, (List) values));
}
}
return res;
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
index 4fe1b59..0554d27 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
@@ -4,11 +4,12 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
public interface ClusteringFunction {
- public Collection apply(List fields);
+ public Collection apply(Config config, List fields);
public Map getParams();
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
index fab8e98..7f342f6 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
@ClusteringClass("immutablefieldvalue")
public class ImmutableFieldValue extends AbstractClusteringFunction {
@@ -14,7 +15,7 @@ public class ImmutableFieldValue extends AbstractClusteringFunction {
}
@Override
- protected Collection doApply(final String s) {
+ protected Collection doApply(final Config conf, final String s) {
final List res = Lists.newArrayList();
res.add(s);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
index 1cabecd..769ecf5 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
@@ -1,6 +1,7 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils;
@@ -15,16 +16,16 @@ public class KeywordsClustering extends AbstractClusteringFunction {
}
@Override
- protected Collection doApply(String s) {
+ protected Collection doApply(final Config conf, String s) {
//takes city codes and keywords codes without duplicates
- Set keywords = getKeywords(s, params.getOrDefault("windowSize", 4));
+ Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
Set cities = getCities(s, params.getOrDefault("windowSize", 4));
//list of combination to return as result
final Collection combinations = new LinkedHashSet();
- for (String keyword: keywordsToCodes(keywords)){
+ for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
for (String city: citiesToCodes(cities)) {
combinations.add(keyword+"-"+city);
if (combinations.size()>=params.getOrDefault("max", 2)) {
@@ -37,13 +38,13 @@ public class KeywordsClustering extends AbstractClusteringFunction {
}
@Override
- public Collection apply(List fields) {
+ public Collection apply(final Config conf, List fields) {
return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue)
.map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
.map(this::normalize)
.map(s -> filterAllStopWords(s))
- .map(this::doApply)
+ .map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank)
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
index 5ec8590..6fe525f 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
@@ -6,6 +6,7 @@ import java.util.Map;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils;
@@ -17,16 +18,16 @@ public class LowercaseClustering extends AbstractClusteringFunction {
}
@Override
- public Collection apply(List fields) {
+ public Collection apply(Config conf, List fields) {
Collection c = Sets.newLinkedHashSet();
for(Field f : fields) {
- c.addAll(doApply(f.stringValue()));
+ c.addAll(doApply(conf, f.stringValue()));
}
return c;
}
@Override
- protected Collection doApply(final String s) {
+ protected Collection doApply(final Config conf, final String s) {
if(StringUtils.isBlank(s)) {
return Lists.newArrayList();
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
index 06885be..baa30d7 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
@@ -6,6 +6,7 @@ import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
@ClusteringClass("ngrampairs")
public class NgramPairs extends Ngrams {
@@ -15,7 +16,7 @@ public class NgramPairs extends Ngrams {
}
@Override
- protected Collection doApply(String s) {
+ protected Collection doApply(Config conf, String s) {
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
index 8549468..214b145 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
@@ -1,5 +1,7 @@
package eu.dnetlib.pace.clustering;
+import eu.dnetlib.pace.config.Config;
+
import java.util.*;
@ClusteringClass("ngrams")
@@ -10,7 +12,7 @@ public class Ngrams extends AbstractClusteringFunction {
}
@Override
- protected Collection doApply(String s) {
+ protected Collection doApply(Config conf, String s) {
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
index 718b88d..26b07f0 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
@@ -2,6 +2,7 @@ package eu.dnetlib.pace.clustering;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang.StringUtils;
@@ -23,7 +24,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
}
@Override
- public Collection apply(final List fields) {
+ public Collection apply(final Config conf, final List fields) {
final Set hashes = Sets.newHashSet();
for (final Field f : fields) {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
index fcb01b9..2020a66 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
@@ -6,6 +6,7 @@ import java.util.Map;
import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("personhash")
@@ -18,7 +19,7 @@ public class PersonHash extends AbstractClusteringFunction {
}
@Override
- protected Collection doApply(final String s) {
+ protected Collection doApply(final Config conf, final String s) {
final List res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
index f012aac..c485fcb 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
@@ -1,5 +1,7 @@
package eu.dnetlib.pace.clustering;
+import eu.dnetlib.pace.config.Config;
+
import java.util.Collection;
import java.util.Map;
@@ -10,7 +12,7 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
}
@Override
- protected Collection doApply(String s) {
+ protected Collection doApply(final Config conf, String s) {
// TODO Auto-generated method stub
return null;
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
index 2f475fe..55b203d 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
@@ -5,6 +5,7 @@ import java.util.*;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
@ClusteringClass("sortedngrampairs")
public class SortedNgramPairs extends NgramPairs {
@@ -14,7 +15,7 @@ public class SortedNgramPairs extends NgramPairs {
}
@Override
- protected Collection doApply(String s) {
+ protected Collection doApply(Config conf, String s) {
final List tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s));
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
index 22dc490..fd8e7a3 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
@@ -4,6 +4,7 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
+import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.StringUtils;
@@ -17,7 +18,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
}
@Override
- protected Collection doApply(final String s) {
+ protected Collection doApply(final Config conf, final String s) {
final List res = Lists.newArrayList();
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
index 3960331..fa1f643 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
@@ -5,6 +5,7 @@ import java.util.Map;
import java.util.Set;
import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
@ClusteringClass("suffixprefix")
public class SuffixPrefix extends AbstractClusteringFunction {
@@ -14,7 +15,7 @@ public class SuffixPrefix extends AbstractClusteringFunction {
}
@Override
- protected Collection doApply(String s) {
+ protected Collection doApply(Config conf, String s) {
return suffixPrefix(s, param("len"), param("max"));
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
index 9955d5f..feb60a2 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
@@ -1,6 +1,7 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import java.net.MalformedURLException;
@@ -21,7 +22,7 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
}
@Override
- public Collection apply(List fields) {
+ public Collection apply(final Config conf, List fields) {
try {
return fields.stream()
.filter(f -> !f.isEmpty())
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
index e453604..23ff7ac 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -13,6 +13,8 @@ import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
+import java.io.IOException;
+import java.io.StringWriter;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
@@ -25,7 +27,6 @@ import java.util.stream.Collectors;
*/
public abstract class AbstractPaceFunctions {
- private static Map translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
@@ -238,10 +239,10 @@ public abstract class AbstractPaceFunctions {
}
- public double keywordsCompare(Set s1, Set s2){
+ public double keywordsCompare(Set s1, Set s2, Map translationMap){
- Set k1 = keywordsToCodes(s1);
- Set k2 = keywordsToCodes(s2);
+ Set k1 = keywordsToCodes(s1, translationMap);
+ Set k2 = keywordsToCodes(s2, translationMap);
int longer = (k1.size()>k2.size())?k1.size():k2.size();
@@ -273,7 +274,7 @@ public abstract class AbstractPaceFunctions {
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
}
- public Set keywordsToCodes(Set keywords) {
+ public Set keywordsToCodes(Set keywords, Map translationMap) {
return toCodes(keywords, translationMap);
}
@@ -324,12 +325,17 @@ public abstract class AbstractPaceFunctions {
return codes;
}
- public Set getKeywords(String s1, int windowSize) {
- return getKeywords(s1, translationMap, windowSize);
- }
-
public Set getCities(String s1, int windowSize) {
return getKeywords(s1, cityMap, windowSize);
}
+ /**
+  * Reads a classpath resource completely into a String.
+  *
+  * @param filename resource name, looked up with {@code clazz.getResourceAsStream(filename)}
+  * @param clazz class used to resolve the resource
+  * @return the resource content as copied by {@code IOUtils.copy(InputStream, Writer)}
+  *         (NOTE(review): that overload presumably decodes with the platform default charset - confirm)
+  * @throws RuntimeException if the resource does not exist or cannot be read
+  */
+ public static String readFromClasspath(final String filename, final Class<?> clazz) {
+     // try-with-resources closes the stream on every exit path
+     try (final java.io.InputStream in = clazz.getResourceAsStream(filename)) {
+         if (in == null) {
+             // getResourceAsStream reports a missing resource by returning null, not by throwing
+             throw new RuntimeException("cannot load resource from classpath: " + filename);
+         }
+         final StringWriter sw = new StringWriter();
+         IOUtils.copy(in, sw);
+         return sw.toString();
+     } catch (final IOException e) {
+         // chain the cause so the underlying I/O failure stays visible in the stack trace
+         throw new RuntimeException("cannot load resource from classpath: " + filename, e);
+     }
+ }
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
index 91c1096..2cdace1 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
@@ -49,4 +49,6 @@ public interface Config {
*/
public Map> blacklists();
+
+ public Map translationMap(); // lookup map handed to getKeywords/keywordsToCodes by the clustering and comparator code
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
index f369df3..beab7a8 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
@@ -55,6 +55,7 @@ public class DedupConfig implements Config, Serializable {
try {
config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel();
+ config.getPace().initTranslationMap();
return config;
} catch (IOException e) {
throw new PaceException("Error in parsing configuration json", e);
@@ -139,4 +140,9 @@ public class DedupConfig implements Config, Serializable {
return getPace().getBlacklists();
}
+ @Override
+ public Map translationMap() { // Config accessor: delegates to the PaceConfig returned by getPace()
+ return getPace().translationMap();
+ }
+
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
index 735af2c..1d4a03e 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
@@ -1,6 +1,7 @@
package eu.dnetlib.pace.config;
import com.google.common.collect.Maps;
+
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
@@ -8,6 +9,7 @@ import eu.dnetlib.pace.util.PaceResolver;
import org.codehaus.jackson.annotate.JsonIgnore;
import java.io.Serializable;
+import java.text.Normalizer;
import java.util.List;
import java.util.Map;
@@ -19,6 +21,10 @@ public class PaceConfig implements Serializable {
private Map decisionTree;
private Map> blacklists;
+ private Map> synonyms;
+
+ @JsonIgnore
+ private Map translationMap;
@JsonIgnore
private Map modelMap;
@@ -30,11 +36,26 @@ public class PaceConfig implements Serializable {
public void initModel() {
modelMap = Maps.newHashMap();
- for(FieldDef fd : getModel()) {
+ for (FieldDef fd : getModel()) {
modelMap.put(fd.getName(), fd);
}
}
+ /**
+  * (Re)builds {@code translationMap} from {@code synonyms}: every term listed under a key is
+  * lower-cased, NFD-normalized and mapped to that key. If the same normalized term appears
+  * under several keys, the key visited last wins.
+  *
+  * A missing {@code synonyms} section (field left null) yields an empty map instead of a
+  * NullPointerException, so configurations without synonyms still load.
+  */
+ public void initTranslationMap(){
+     translationMap = Maps.newHashMap();
+     if (synonyms == null) {
+         // nothing configured: keep the empty map so lookups simply find no keyword
+         return;
+     }
+     for (String key : synonyms.keySet()) {
+         if (synonyms.get(key) == null) {
+             // a key declared with a null term list contributes no entries
+             continue;
+         }
+         for (String term : synonyms.get(key)){
+             translationMap.put(
+                 Normalizer.normalize(term.toLowerCase(), Normalizer.Form.NFD),
+                 key);
+         }
+     }
+ }
+
+ public Map translationMap(){ // normalized term -> synonyms key, as filled by initTranslationMap()
+ return translationMap; // returns the internal map itself, not a copy
+ }
+
public List getModel() {
return model;
}
@@ -67,6 +88,14 @@ public class PaceConfig implements Serializable {
this.blacklists = blacklists;
}
+ public Map> getSynonyms() { // bean getter for the synonyms configuration (key -> terms iterated by initTranslationMap)
+ return synonyms;
+ }
+
+ public void setSynonyms(Map> synonyms) { // bean setter; does not rebuild translationMap - call initTranslationMap() afterwards
+ this.synonyms = synonyms;
+ }
+
public Map getModelMap() {
return modelMap;
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
index ff1cd97..d5a33ea 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
@@ -1,6 +1,7 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@@ -23,7 +24,7 @@ public class AlwaysMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b) {
+ public double compare(final Field a, final Field b, final Config conf) {
return 1.0;
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
index 5a844de..11f628d 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
@@ -1,6 +1,7 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@@ -22,7 +23,7 @@ public class ExactMatch extends AbstractComparator {
}
@Override
- public double distance(final String a, final String b) {
+ public double distance(final String a, final String b, final Config conf) {
return a.equals(b) ? 1.0 : 0;
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
index d51a1bd..4cfe048 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
@@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@@ -14,7 +15,7 @@ public class ExactMatchIgnoreCase extends AbstractComparator {
}
@Override
- public double compare(Field a, Field b) {
+ public double compare(Field a, Field b, final Config conf) {
final String fa = getValue(a);
final String fb = getValue(b);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java
index 76f1fd2..0af0a80 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java
@@ -1,6 +1,7 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@@ -23,7 +24,7 @@ public class JaroWinkler extends AbstractComparator {
}
@Override
- public double distance(String a, String b) {
+ public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
index 85f657f..4c16780 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
@@ -4,6 +4,8 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
+import eu.dnetlib.pace.config.Config;
+
import java.util.Map;
import java.util.Set;
@@ -26,7 +28,7 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
}
@Override
- public double distance(String a, String b) {
+ public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
@@ -36,15 +38,15 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
- Set keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
- Set keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
+ Set keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
+ Set keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
if (sameCity(cities1,cities2)) {
- if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
+ if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) {
ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java
index 23c4cfa..8556eae 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java
@@ -4,6 +4,8 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
+import eu.dnetlib.pace.config.Config;
+
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@@ -23,7 +25,7 @@ public class JaroWinklerTitle extends AbstractComparator {
}
@Override
- public double distance(String a, String b) {
+ public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java
index 6ddd2c9..6a19e66 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java
@@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
+import eu.dnetlib.pace.config.Config;
import java.util.Map;
@@ -22,7 +23,7 @@ public class Level2JaroWinklerTitle extends AbstractComparator {
}
@Override
- public double distance(final String a, final String b) {
+ public double distance(final String a, final String b, final Config conf) {
final String ca = cleanup(a);
final String cb = cleanup(b);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java
index 0d444e1..546284e 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java
@@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
+import eu.dnetlib.pace.config.Config;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -26,7 +28,7 @@ public class LevensteinTitle extends AbstractComparator {
}
@Override
- public double distance(final String a, final String b) {
+ public double distance(final String a, final String b, final Config conf) {
final String ca = cleanup(a);
final String cb = cleanup(b);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java
index 9413eb5..83b80b1 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java
@@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
+import eu.dnetlib.pace.config.Config;
+
import java.util.Map;
@@ -25,7 +27,7 @@ public class LevensteinTitleIgnoreVersion extends AbstractComparator {
}
@Override
- public double distance(final String a, final String b) {
+ public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java
index 4b2e707..01d77b3 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java
@@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
+import eu.dnetlib.pace.config.Config;
import java.util.Map;
@@ -22,7 +23,7 @@ public class MustBeDifferent extends AbstractComparator {
}
@Override
- public double distance(final String a, final String b) {
+ public double distance(final String a, final String b, final Config conf) {
return !a.equals(b) ? 1.0 : 0;
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java
index 47d45a1..e65ac71 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java
@@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@@ -17,7 +18,7 @@ public class NullDistanceAlgo implements Comparator {
}
@Override
- public double compare(Field a, Field b) {
+ public double compare(Field a, Field b, Config config) {
return 0;
}
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java
index 519c28d..14845da 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java
@@ -1,6 +1,7 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.adaptor.Pid;
@@ -27,7 +28,7 @@ public class PidMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b) {
+ public double compare(final Field a, final Field b, final Config conf) {
final List sa = ((FieldList) a).stringList();
final List sb = ((FieldList) b).stringList();
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java
index e79f918..91f1e35 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java
@@ -5,6 +5,7 @@ import java.util.Map;
import com.google.common.collect.Iterables;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@@ -28,7 +29,7 @@ public class SizeMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b) {
+ public double compare(final Field a, final Field b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java
index 2ea7bd8..004fc90 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java
@@ -1,5 +1,8 @@
package eu.dnetlib.pace.tree;
+import eu.dnetlib.pace.config.Config;
+import org.apache.commons.lang.StringUtils;
+
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
@@ -67,9 +70,9 @@ public class SubStringLevenstein extends AbstractComparator {
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
*/
@Override
- public double compare(final Field a, final Field b) {
+ public double distance(final Field a, final Field b, final Config conf) {
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
- return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
+ return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf);
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java
index 6643262..fd86b17 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java
@@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@@ -21,7 +22,7 @@ public class TitleVersionMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b) {
+ public double compare(final Field a, final Field b, final Config conf) {
final String valueA = getFirstValue(a);
final String valueB = getFirstValue(b);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java
index 225f9ca..8f36126 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java
@@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang.StringUtils;
@@ -28,8 +29,7 @@ public class UrlMatcher extends Levenstein {
}
@Override
- public double compare(Field a, Field b) {
-
+ public double distance(Field a, Field b, final Config conf) {
final URL urlA = asUrl(getFirstValue(a));
final URL urlB = asUrl(getFirstValue(b));
@@ -44,7 +44,7 @@ public class UrlMatcher extends Levenstein {
return hostW * 0.5;
}
- return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
+ return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
}
private URL asUrl(final String value) {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java
index fbb0263..f57cba5 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java
@@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@@ -22,7 +23,7 @@ public class YearMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b) {
+ public double compare(final Field a, final Field b, final Config conf) {
final String valueA = getNumbers(getFirstValue(a));
final String valueB = getNumbers(getFirstValue(b));
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java
index f6ad137..1ed14b0 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java
@@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
@@ -66,7 +67,7 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
* the b
* @return the double
*/
- public double distance(final String a, final String b) {
+ public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) {
return -1; //return -1 if a field is missing
@@ -84,16 +85,23 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
* the b
* @return the double
*/
- protected double distance(final List a, final List b) {
- return distance(concat(a), concat(b));
+ protected double distance(final List a, final List b, final Config conf) {
+ return distance(concat(a), concat(b), conf);
+ }
+
+ public double distance(final Field a, final Field b, final Config conf) {
+ if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
+ if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
+
+ throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
}
@Override
- public double compare(final Field a, final Field b) {
+ public double compare(final Field a, final Field b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
- if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue());
- if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b));
+ if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
+ if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java
index 64ff4f3..8524996 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java
@@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree.support;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
public interface Comparator {
@@ -8,6 +9,6 @@ public interface Comparator {
* return : -1 -> can't decide (missing field)
* >0 -> similarity degree (depends on the algorithm)
* */
- public double compare(Field a, Field b);
+ public double compare(Field a, Field b, Config conf);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java
index 7294536..1d187b0 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java
@@ -1,8 +1,10 @@
package eu.dnetlib.pace.tree.support;
+import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
+import org.codehaus.jackson.annotate.JsonIgnore;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
@@ -35,7 +37,7 @@ public class TreeNodeDef implements Serializable {
public TreeNodeDef() {
}
- public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2) {
+ public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats();
stats.setFieldsCount(fields.size());
@@ -44,7 +46,7 @@ public class TreeNodeDef implements Serializable {
double weight = fieldConf.getWeight();
- double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
+ double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
if (result == -1) { //if the field is missing
stats.incrementMissCount();
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java
index 794511a..70e9623 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java
@@ -38,7 +38,7 @@ public class TreeProcessor {
if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + current);
- TreeNodeStats stats = currentNode.evaluate(doc1, doc2);
+ TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
current = currentNode.getUndefined();
diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv
index c74b357..e97fd52 100644
--- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv
+++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv
@@ -1,4 +1,4 @@
-key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
+key::1;university;universita;universita studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
@@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
-key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology
+key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;universidad politecnica;universitat politecnica;politechnika;politechniki;university technology;university science technology
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
index 84ec090..6eec8cb 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@@ -1,22 +1,24 @@
package eu.dnetlib.pace.clustering;
-import java.util.Map;
-
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
-import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.config.DedupConfig;
import org.junit.Before;
import org.junit.Test;
+import java.util.Map;
+
public class ClusteringFunctionTest extends AbstractPaceTest {
private Map params;
+ DedupConfig conf;
@Before
public void setUp() throws Exception {
params = Maps.newHashMap();
+ conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class));
}
@Test
@@ -26,7 +28,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "http://www.test.it/path/to/resource";
System.out.println(s);
- System.out.println(urlClustering.apply(Lists.newArrayList(url(s))));
+ System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s))));
}
@Test
@@ -40,7 +42,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(ngram.apply(Lists.newArrayList(title(s))));
+ System.out.println(ngram.apply(conf, Lists.newArrayList(title(s))));
}
@Test
@@ -52,7 +54,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(np.apply(Lists.newArrayList(title(s))));
+ System.out.println(np.apply(conf, Lists.newArrayList(title(s))));
}
@Test
@@ -64,11 +66,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s1 = "University of Pisa";
System.out.println(s1);
- System.out.println(np.apply(Lists.newArrayList(title(s1))));
+ System.out.println(np.apply(conf, Lists.newArrayList(title(s1))));
final String s2 = "Pisa University";
System.out.println(s2);
- System.out.println(np.apply(Lists.newArrayList(title(s2))));
+ System.out.println(np.apply(conf, Lists.newArrayList(title(s2))));
}
@Test
@@ -81,7 +83,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(acro.apply(Lists.newArrayList(title(s))));
+ System.out.println(acro.apply(conf, Lists.newArrayList(title(s))));
}
@Test
@@ -93,7 +95,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(sp.apply(Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
}
@Test
@@ -105,7 +107,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(sp.apply(Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
}
@Test
@@ -114,7 +116,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = readFromClasspath("gt.author.json");
System.out.println(s);
- System.out.println(cf.apply(Lists.newArrayList(person(s))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(person(s))));
}
@Test
@@ -123,27 +125,27 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final ClusteringFunction cf = new KeywordsClustering(params);
final String s = "Polytechnic University of Turin";
System.out.println(s);
- System.out.println(cf.apply(Lists.newArrayList(title(s))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
final String s1 = "POLITECNICO DI TORINO";
System.out.println(s1);
- System.out.println(cf.apply(Lists.newArrayList(title(s1))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
final String s2 = "Universita farmaceutica culturale di milano bergamo";
System.out.println("s2 = " + s2);
- System.out.println(cf.apply(Lists.newArrayList(title(s2))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(title(s2))));
final String s3 = "universita universita milano milano";
System.out.println("s3 = " + s3);
- System.out.println(cf.apply(Lists.newArrayList(title(s3))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(title(s3))));
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
System.out.println("s4 = " + s4);
- System.out.println(cf.apply(Lists.newArrayList(title(s4))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(title(s4))));
final String s5 = "İstanbul Ticarət Universiteti";
System.out.println("s5 = " + s5);
- System.out.println(cf.apply(Lists.newArrayList(title(s5))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
}
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java
index 1004203..c3dc482 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java
@@ -2,6 +2,7 @@ package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
+import eu.dnetlib.pace.config.DedupConfig;
import org.junit.Before;
import org.junit.Test;
@@ -17,13 +18,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
private Map params;
+ private DedupConfig conf;
@Before
public void setup() {
- System.out.println("****************************************************************");
- System.out.println("Test String : " + TEST_STRING);
params = new HashMap<>();
params.put("weight", 1.0);
+ conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class));
}
@Test
@@ -55,7 +56,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
@Test
public void testJaroWinklerNormalizedName() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State");
+ double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf);
System.out.println("result = " + result);
assertEquals(0.0, result);
@@ -65,49 +66,49 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
public void testJaroWinklerNormalizedName2() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York");
+ double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf);
- assertEquals(result, 1.0);
+ assertEquals(1.0, result);
}
@Test
public void testJaroWinklerNormalizedName3() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna");
+ double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf);
System.out.println("result = " + result);
- assertEquals(result, 0.0);
+ assertEquals(0.0, result);
}
@Test
public void testJaroWinklerNormalizedName4() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa");
+ double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf);
System.out.println("result = " + result);
- assertEquals(result, 1.0);
+ assertEquals(1.0, result);
}
@Test
public void testJaroWinklerNormalizedName5() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS");
+ double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf);
System.out.println("result = " + result);
- assertEquals(result, 1.0);
+ assertEquals(1.0, result);
}
@Test
public void testJaroWinklerNormalizedName6() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung");
+ double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
System.out.println("result = " + result);
- assertTrue(result> 0.9);
+ assertTrue(result > 0.9);
}
@@ -115,17 +116,17 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
public void testJaroWinklerNormalizedName7() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO");
+ double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf);
System.out.println("result = " + result);
- assertTrue(result> 0.9);
+ assertTrue(result > 0.9);
}
@Test
public void testJaroWinklerNormalizedName8() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology");
+ double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf);
System.out.println("result = " + result);
}
@@ -134,7 +135,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
public void testJaroWinklerNormalizedName9() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti");
+ double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf);
System.out.println("result = " + result);
}
@@ -144,7 +145,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
- double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence");
+ double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
System.out.println("result = " + result);
}
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
index 575b3c7..9051049 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
@@ -5,12 +5,13 @@ import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
public class ConfigTest extends AbstractPaceTest {
@Test
public void dedupConfigSerializationTest() {
- final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
+ final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("org.curr.conf"));
final String conf = cfgFromClasspath.toString();
@@ -37,4 +38,20 @@ public class ConfigTest extends AbstractPaceTest {
System.out.println(load.toString());
}
+ @Test
+ public void translationMapTest() {
+
+ DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf"));
+
+ System.out.println("translationMap = " + load.getPace().translationMap().toString());
+ }
+
+ @Test
+ public void emptyTranslationMapTest() {
+
+ DedupConfig load = DedupConfig.load(readFromClasspath("org.test.conf"));
+
+ assertEquals(0, load.getPace().translationMap().keySet().size());
+ }
+
}
diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf
index a5b28ce..017cbab 100644
--- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf
+++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf
@@ -5,32 +5,152 @@
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
- "groupMaxSize" : "10",
+ "groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
+<<<<<<< HEAD
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
+=======
+ { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
+ { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
+ { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
+ { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
+>>>>>>> origin/master
],
"sufficientConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
],
+<<<<<<< HEAD
"necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
+=======
+ "conditions" : [
+ { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
+ { "name" : "exactMatch", "fields" : [ "country" ] }
+>>>>>>> origin/master
],
"model" : [
- { "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
- { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
+ { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
- { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} },
+ { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],
- "blacklists" : { }
+ "blacklists" : {
+ "legalname" : []
+ },
+ "synonyms": {
+ "key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
+ "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
+ "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
+ "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
+ "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
+ "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
+ "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
+ "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
+ "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
+ "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
+ "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
+ "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
+ "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
+ "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
+ "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
+ "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
+ "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
+ "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
+ "key::19": ["museum","museo","musée","museo","museu","museum","muzeum","музей","museum","μουσείο"],
+ "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
+ "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
+ "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
+ "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
+ "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
+ "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
+ "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
+ "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
+ "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
+ "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
+ "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
+ "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
+ "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
+ "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
+ "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
+ "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
+ "key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
+ "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
+ "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
+ "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
+ "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
+ "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
+ "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
+ "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
+ "key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
+ "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
+ "key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
+ "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
+ "key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
+ "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
+ "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
+ "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"],
+ "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"],
+ "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"],
+ "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"],
+ "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"],
+ "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"],
+ "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"],
+ "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskkonna"],
+ "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte","äri"],
+ "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"],
+ "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"],
+ "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"],
+ "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"],
+ "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"],
+ "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"],
+ "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"],
+ "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"],
+ "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"],
+ "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"],
+ "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärztlich","veterinair","veeartsenijkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"],
+ "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"],
+ "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"],
+ "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"],
+ "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"],
+ "key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline"],
+ "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"],
+ "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"],
+ "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"],
+ "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"],
+ "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"],
+ "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kulturális","kulturni","kultuuri","kultuuriline"],
+ "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"],
+ "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"],
+ "key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"],
+ "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"],
+ "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"],
+ "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"],
+ "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"],
+ "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"],
+ "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"],
+ "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"],
+ "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"],
+ "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"],
+ "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"],
+ "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"],
+ "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszichiátria","psihiatrija","psühhiaatria"],
+ "key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszichológia","psihologija","psühholoogia"],
+ "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"],
+ "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"],
+ "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"],
+ "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"],
+ "key::102": ["informatics","informatica","informática","informática","informatica"],
+ "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
+ "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
+ }
}
}
\ No newline at end of file
diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf
new file mode 100644
index 0000000..3af6585
--- /dev/null
+++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf
@@ -0,0 +1,40 @@
+{
+ "wf" : {
+ "threshold" : "0.9",
+ "dedupRun" : "001",
+ "entityType" : "organization",
+ "orderField" : "legalname",
+ "queueMaxSize" : "2000",
+ "groupMaxSize" : "50",
+ "slidingWindowSize" : "200",
+ "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
+ "includeChildren" : "true"
+ },
+ "pace" : {
+ "clustering" : [
+ { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
+ { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
+ { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
+ { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
+ ],
+ "strictConditions" : [
+ { "name" : "exactMatch", "fields" : [ "gridid" ] }
+ ],
+ "conditions" : [
+ { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
+ { "name" : "exactMatch", "fields" : [ "country" ] }
+ ],
+ "model" : [
+ { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
+ { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
+ { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
+ { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
+ { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
+ ],
+ "blacklists" : {
+ "legalname" : []
+ },
+ "synonyms": {
+ }
+ }
+}
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 57ca8d6..4e9d3fe 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
eu.dnetlib
dnet-dedup
- 3.0.14-SNAPSHOT
+ 3.0.15-SNAPSHOT
pom
diff --git a/release.properties b/release.properties
deleted file mode 100644
index 5c101a5..0000000
--- a/release.properties
+++ /dev/null
@@ -1,11 +0,0 @@
-#release configuration
-#Mon Jul 08 10:03:15 CEST 2019
-scm.tagNameFormat=@{project.artifactId}-@{project.version}
-pushChanges=true
-scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git
-preparationGoals=clean verify
-projectVersionPolicyId=default
-remoteTagging=true
-scm.commentPrefix=[maven-release-plugin]
-exec.snapshotReleasePluginAllowed=false
-completedPhase=create-backup-poms