diff --git a/dnet-dedup-test/dependency-reduced-pom.xml b/dnet-dedup-test/dependency-reduced-pom.xml deleted file mode 100644 index 07b9268..0000000 --- a/dnet-dedup-test/dependency-reduced-pom.xml +++ /dev/null @@ -1,119 +0,0 @@ - - - - dnet-dedup - eu.dnetlib - 3.0.14-SNAPSHOT - - 4.0.0 - dnet-dedup-test - - - - maven-shade-plugin - 2.4.3 - - - package - - shade - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - - - maven-deploy-plugin - 2.7 - - true - - - - maven-compiler-plugin - - 1.8 - 1.8 - - **/*.java - - - - - net.alchim31.maven - scala-maven-plugin - 4.0.1 - - - scala-compile-first - initialize - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - ${scala.version} - - - - - - - junit - junit - 4.9 - test - - - hamcrest-core - org.hamcrest - - - - - org.apache.oozie - oozie-client - 5.1.0 - test - - - json-simple - com.googlecode.json-simple - - - jms - javax.jms - - - slf4j-simple - org.slf4j - - - oozie-fluent-job-api - org.apache.oozie - - - - - - diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index e5d429b..b2b0437 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.14-SNAPSHOT + 3.0.15-SNAPSHOT ../pom.xml diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java index a0f4e48..0fcea0a 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java @@ -88,11 +88,11 @@ public class SparkLocalTest { connectedComponents.foreach(cc -> { System.out.println(cc); }); - connectedComponents.foreach(cc -> { - cc.getDocs().stream().forEach(d -> { - System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname")); - }); - }); +// connectedComponents.foreach(cc -> { +// cc.getDocs().stream().forEach(d -> { +// 
System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname")); +// }); +// }); //print nondeduped nonDeduplicated.foreach(cc -> { System.out.println(cc); diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf index dd4c4bf..195cd98 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf @@ -31,6 +31,112 @@ ], "blacklists" : { "legalname" : [] + }, + "synonyms": { + "key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": 
["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": 
["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35":
["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50":
["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""], + "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""], + "key::57": 
["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""], + "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""], + "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""], + 
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""], + "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""], + "key::75": 
["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline",""], + "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""], + "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""], + "key::83":
["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""], + "key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""], + "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""], + "key::92":
["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], + "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""], + "key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""], + "key::101":
["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""], + "key::102": ["informatics","informatica","informática","informática","informatica",""], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] } } } \ No newline at end of file diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json index 1e335ed..948fcf9 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json @@ -1,3 +1,4 @@ +<<<<<<< HEAD {"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIOK"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6183d331a1920dd81b8c10620a8b3a8a"} {"collectedfrom":[{"value":"GRID - Global Research Identifier 
Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIVEL"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::8f65fd4e764086db897cc648e9cbbaed"} {"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Social Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::c69cffc4997b54bb2eb5ca6aebcda18b"} @@ -9,4 +10,9 @@ {"dateoftransformation":"2018-11-20","originalId":["corda_______::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE 
GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::755737ed505484ea374062762ef05ef6"} {"dateoftransformation":"2019-06-26","originalId":["corda__h2020::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::755737ed505484ea374062762ef05ef6"} {"dateoftransformation":"2018-09-13","originalId":["snsf________::The_Netherlands_Institute_of_Health_Services_Research_NIVEL"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute of Health Services Research 
NIVEL"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-22","type":20,"id":"20|snsf________::10653be4e9c170181486aa9782346d81"} -{"dateoftransformation":"2018-09-13","originalId":["openaire____::088a0087-4bc6-4c38-a052-b446c3b225a7::The Netherlands Institute for Social Research"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute for Social Research"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-03-30","type":20,"id":"20|openaire____::857b30f258c43852a2cb57875ac40892"} \ No newline at end of file +{"dateoftransformation":"2018-09-13","originalId":["openaire____::088a0087-4bc6-4c38-a052-b446c3b225a7::The Netherlands Institute for Social Research"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute for Social 
Research"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-03-30","type":20,"id":"20|openaire____::857b30f258c43852a2cb57875ac40892"} +======= +{"dateoftransformation":"2018-11-20","originalId":["corda_______::999987066"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NATIONAAL LUCHT- EN RUIMTEVAARTLABORATORIUM"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::9cb56cf06fbe3926d0c88ee320908848"} +{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999987066"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NATIONAAL LUCHT- EN 
RUIMTEVAARTLABORATORIUM"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::9cb56cf06fbe3926d0c88ee320908848"} +{"dateoftransformation":"2018-11-12","originalId":["opendoar____::Netherlands_Aerospace_Centre"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Netherlands Aerospace Centre"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-11-12","type":20,"id":"20|opendoar____::ce12359dec61a8e00837c3e507918812"} +>>>>>>> origin/master diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf index 517abb0..56a39de 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.simple.pace.conf @@ -15,7 +15,8 @@ "model" : [ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" } ], - "blacklists" : { } + "blacklists" : { }, + "synonyms" : { } } } \ No newline at end of file diff --git 
a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java index 702b4ab..a23d6dd 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace; import org.apache.oozie.client.OozieClient; import org.apache.oozie.client.OozieClientException; import org.apache.oozie.client.WorkflowJob; +import org.junit.Ignore; import org.junit.Test; import java.io.IOException; @@ -12,6 +13,7 @@ import static junit.framework.Assert.assertEquals; public class DedupTestIT { + @Ignore @Test public void deduplicationTest() throws OozieClientException, InterruptedException { diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf index be3a9bf..cee2fa1 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf @@ -1,27 +1,152 @@ { "wf" : { - "threshold" : "0.98", + "threshold" : "0.9", "dedupRun" : "001", "entityType" : "organization", "orderField" : "legalname", "queueMaxSize" : "2000", - "groupMaxSize" : "10", + "groupMaxSize" : "50", "slidingWindowSize" : "200", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true" }, "pace" : { "clustering" : [ +<<<<<<< HEAD { "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} }, { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } } ], "necessaryConditions" : [ { "name" : "exactMatch", "fieldsCount" : [ "country" ] } +======= + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : 
"suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "gridid" ] } + ], + "conditions" : [ + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, + { "name" : "exactMatch", "fields" : [ "country" ] } +>>>>>>> origin/master ], "model" : [ - { "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, - { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" } + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, + { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } ], - "blacklists" : { } + "blacklists" : { + "legalname" : [] + }, + "synonyms": { + "key::1": ["university","università","universita","università studi","universita 
studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": 
["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": 
["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita 
politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": 
["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"], + "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"], + "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"], + "key::59": 
["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"], + "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"], + "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"], + "key::68": 
["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"], + "key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"], + "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"], + "key::77": 
["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"], + "key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"], + "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"], + "key::86": 
["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"], + "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"], + "key::96": 
["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"], + "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"], + "key::102": ["informatics","informatica","informática","informática","informatica"], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] + } } } \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf index 5b67978..af5d545 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf +++ 
b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf @@ -19,7 +19,8 @@ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, { "name" : "authors", "algo" : "SortedLevel2JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" } ], - "blacklists" : { } + "blacklists" : { }, + "synonyms" : { } } } diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf index cb70c63..2f61ae6 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf @@ -45,7 +45,9 @@ "^(WHP Cruise Summary Information of section).*$", "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", "^(Measurement of the spin\\-dependent structure function).*" - ] } + ] }, + "synonyms" : { + } } } diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf index 992d57e..1111a19 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf @@ -23,7 +23,8 @@ { "name" : "title", "algo" : "Level2Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ], - "blacklists" : { } + "blacklists" : { }, + "synonyms" : { } } } diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf 
b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf index b3284ce..72ca533 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf @@ -15,7 +15,8 @@ "model" : [ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" } ], - "blacklists" : { } + "blacklists" : { } , + "synonyms" : { } } } \ No newline at end of file diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 9499750..34138cc 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.14-SNAPSHOT + 3.0.15-SNAPSHOT ../pom.xml diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index 1782b87..7fdcce4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -18,15 +19,15 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i this.params = params; } - protected abstract Collection doApply(String s); + protected abstract Collection doApply(Config conf, String s); @Override - public Collection apply(List fields) { + public Collection apply(Config conf, List fields) { return fields.stream().filter(f -> !f.isEmpty()) .map(Field::stringValue) .map(this::normalize) .map(s -> filterAllStopWords(s)) - .map(this::doApply) + .map(s -> doApply(conf, s)) .map(c -> 
filterBlacklisted(c, ngramBlacklist)) .flatMap(c -> c.stream()) .filter(StringUtils::isNotBlank) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java index ee5efc9..d300833 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java @@ -6,6 +6,7 @@ import java.util.Set; import java.util.StringTokenizer; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; @ClusteringClass("acronyms") public class Acronyms extends AbstractClusteringFunction { @@ -15,7 +16,7 @@ public class Acronyms extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java index a4b58aa..52859b4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java @@ -13,15 +13,15 @@ import eu.dnetlib.pace.model.Field; public class ClusteringCombiner { public static Collection combine(final Document a, final Config conf) { - return new ClusteringCombiner().doCombine(a, conf.clusterings()); + return new ClusteringCombiner().doCombine(a, conf); } - private Collection doCombine(final Document a, final List defs) { + private Collection doCombine(final Document a, final Config conf) { final Collection res = Sets.newLinkedHashSet(); - for (final ClusteringDef cd : defs) { + for (final ClusteringDef cd : conf.clusterings()) { for (final String fieldName : cd.getFields()) { final Field values = a.values(fieldName); - 
res.addAll(cd.clusteringFunction().apply((List) values)); + res.addAll(cd.clusteringFunction().apply(conf, (List) values)); } } return res; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java index 4fe1b59..0554d27 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -4,11 +4,12 @@ import java.util.Collection; import java.util.List; import java.util.Map; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; public interface ClusteringFunction { - public Collection apply(List fields); + public Collection apply(Config config, List fields); public Map getParams(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java index fab8e98..7f342f6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java @@ -5,6 +5,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; @ClusteringClass("immutablefieldvalue") public class ImmutableFieldValue extends AbstractClusteringFunction { @@ -14,7 +15,7 @@ public class ImmutableFieldValue extends AbstractClusteringFunction { } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); res.add(s); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index 1cabecd..769ecf5 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -15,16 +16,16 @@ public class KeywordsClustering extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(final Config conf, String s) { //takes city codes and keywords codes without duplicates - Set keywords = getKeywords(s, params.getOrDefault("windowSize", 4)); + Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4)); Set cities = getCities(s, params.getOrDefault("windowSize", 4)); //list of combination to return as result final Collection combinations = new LinkedHashSet(); - for (String keyword: keywordsToCodes(keywords)){ + for (String keyword: keywordsToCodes(keywords, conf.translationMap())){ for (String city: citiesToCodes(cities)) { combinations.add(keyword+"-"+city); if (combinations.size()>=params.getOrDefault("max", 2)) { @@ -37,13 +38,13 @@ public class KeywordsClustering extends AbstractClusteringFunction { } @Override - public Collection apply(List fields) { + public Collection apply(final Config conf, List fields) { return fields.stream().filter(f -> !f.isEmpty()) .map(Field::stringValue) .map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here? 
.map(this::normalize) .map(s -> filterAllStopWords(s)) - .map(this::doApply) + .map(s -> doApply(conf, s)) .map(c -> filterBlacklisted(c, ngramBlacklist)) .flatMap(c -> c.stream()) .filter(StringUtils::isNotBlank) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java index 5ec8590..6fe525f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -6,6 +6,7 @@ import java.util.Map; import com.google.common.collect.Lists; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -17,16 +18,16 @@ public class LowercaseClustering extends AbstractClusteringFunction { } @Override - public Collection apply(List fields) { + public Collection apply(Config conf, List fields) { Collection c = Sets.newLinkedHashSet(); for(Field f : fields) { - c.addAll(doApply(f.stringValue())); + c.addAll(doApply(conf, f.stringValue())); } return c; } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { if(StringUtils.isBlank(s)) { return Lists.newArrayList(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java index 06885be..baa30d7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -6,6 +6,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; @ClusteringClass("ngrampairs") public class NgramPairs extends Ngrams { @@ -15,7 +16,7 @@ public class NgramPairs extends Ngrams { } 
@Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java index 8549468..214b145 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -1,5 +1,7 @@ package eu.dnetlib.pace.clustering; +import eu.dnetlib.pace.config.Config; + import java.util.*; @ClusteringClass("ngrams") @@ -10,7 +12,7 @@ public class Ngrams extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java index 718b88d..26b07f0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.clustering; import com.google.common.collect.Sets; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Person; import org.apache.commons.lang.StringUtils; @@ -23,7 +24,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin } @Override - public Collection apply(final List fields) { + public Collection apply(final Config conf, final List fields) { final Set hashes = Sets.newHashSet(); for (final Field f : fields) { diff --git 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java index fcb01b9..2020a66 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -6,6 +6,7 @@ import java.util.Map; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Person; @ClusteringClass("personhash") @@ -18,7 +19,7 @@ public class PersonHash extends AbstractClusteringFunction { } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java index f012aac..c485fcb 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -1,5 +1,7 @@ package eu.dnetlib.pace.clustering; +import eu.dnetlib.pace.config.Config; + import java.util.Collection; import java.util.Map; @@ -10,7 +12,7 @@ public class RandomClusteringFunction extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(final Config conf, String s) { // TODO Auto-generated method stub return null; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java index 2f475fe..55b203d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -5,6 +5,7 @@ import java.util.*; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; @ClusteringClass("sortedngrampairs") public class SortedNgramPairs extends NgramPairs { @@ -14,7 +15,7 @@ public class SortedNgramPairs extends NgramPairs { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { final List tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s)); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java index 22dc490..fd8e7a3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java @@ -4,6 +4,7 @@ import java.util.Collection; import java.util.List; import java.util.Map; +import eu.dnetlib.pace.config.Config; import org.apache.commons.lang.RandomStringUtils; import org.apache.commons.lang.StringUtils; @@ -17,7 +18,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); res.add(StringUtils.isBlank(s) ? 
RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java index 3960331..fa1f643 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java @@ -5,6 +5,7 @@ import java.util.Map; import java.util.Set; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; @ClusteringClass("suffixprefix") public class SuffixPrefix extends AbstractClusteringFunction { @@ -14,7 +15,7 @@ public class SuffixPrefix extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return suffixPrefix(s, param("len"), param("max")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index 9955d5f..feb60a2 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import java.net.MalformedURLException; @@ -21,7 +22,7 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu } @Override - public Collection apply(List fields) { + public Collection apply(final Config conf, List fields) { try { return fields.stream() .filter(f -> !f.isEmpty()) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index e453604..23ff7ac 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -13,6 +13,8 @@ import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; +import java.io.IOException; +import java.io.StringWriter; import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; @@ -25,7 +27,6 @@ import java.util.stream.Collectors; */ public abstract class AbstractPaceFunctions { - private static Map translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv"); private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); @@ -238,10 +239,10 @@ public abstract class AbstractPaceFunctions { } - public double keywordsCompare(Set s1, Set s2){ + public double keywordsCompare(Set s1, Set s2, Map translationMap){ - Set k1 = keywordsToCodes(s1); - Set k2 = keywordsToCodes(s2); + Set k1 = keywordsToCodes(s1, translationMap); + Set k2 = keywordsToCodes(s2, translationMap); int longer = (k1.size()>k2.size())?k1.size():k2.size(); @@ -273,7 +274,7 @@ public abstract class AbstractPaceFunctions { return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); } - public Set keywordsToCodes(Set keywords) { + public Set keywordsToCodes(Set keywords, Map translationMap) { return toCodes(keywords, translationMap); } @@ -324,12 +325,17 @@ public abstract class AbstractPaceFunctions { return codes; } - public Set getKeywords(String s1, int windowSize) { - return getKeywords(s1, translationMap, windowSize); - } - public Set getCities(String s1, int windowSize) { return getKeywords(s1, cityMap, windowSize); } + public static String readFromClasspath(final String filename, final Class clazz) 
{ + final StringWriter sw = new StringWriter(); + try { + IOUtils.copy(clazz.getResourceAsStream(filename), sw); + return sw.toString(); + } catch (final IOException e) { + throw new RuntimeException("cannot load resource from classpath: " + filename); + } + } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 91c1096..2cdace1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -49,4 +49,6 @@ public interface Config { */ public Map> blacklists(); + + public Map translationMap(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index f369df3..beab7a8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -55,6 +55,7 @@ public class DedupConfig implements Config, Serializable { try { config = new ObjectMapper().readValue(json, DedupConfig.class); config.getPace().initModel(); + config.getPace().initTranslationMap(); return config; } catch (IOException e) { throw new PaceException("Error in parsing configuration json", e); @@ -139,4 +140,9 @@ public class DedupConfig implements Config, Serializable { return getPace().getBlacklists(); } + @Override + public Map translationMap() { + return getPace().translationMap(); + } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 735af2c..1d4a03e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.config; import com.google.common.collect.Maps; + import 
eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.TreeNodeDef; @@ -8,6 +9,7 @@ import eu.dnetlib.pace.util.PaceResolver; import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; +import java.text.Normalizer; import java.util.List; import java.util.Map; @@ -19,6 +21,10 @@ public class PaceConfig implements Serializable { private Map decisionTree; private Map> blacklists; + private Map> synonyms; + + @JsonIgnore + private Map translationMap; @JsonIgnore private Map modelMap; @@ -30,11 +36,26 @@ public class PaceConfig implements Serializable { public void initModel() { modelMap = Maps.newHashMap(); - for(FieldDef fd : getModel()) { + for (FieldDef fd : getModel()) { modelMap.put(fd.getName(), fd); } } + public void initTranslationMap(){ + translationMap = Maps.newHashMap(); + for (String key : synonyms.keySet()) { + for (String term : synonyms.get(key)){ + translationMap.put( + Normalizer.normalize(term.toLowerCase(), Normalizer.Form.NFD), + key); + } + } + } + + public Map translationMap(){ + return translationMap; + } + public List getModel() { return model; } @@ -67,6 +88,14 @@ public class PaceConfig implements Serializable { this.blacklists = blacklists; } + public Map> getSynonyms() { + return synonyms; + } + + public void setSynonyms(Map> synonyms) { + this.synonyms = synonyms; + } + public Map getModelMap() { return modelMap; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java index ff1cd97..d5a33ea 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import 
eu.dnetlib.pace.tree.support.ComparatorClass; @@ -23,7 +24,7 @@ public class AlwaysMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b) { + public double compare(final Field a, final Field b, final Config conf) { return 1.0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java index 5a844de..11f628d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @@ -22,7 +23,7 @@ public class ExactMatch extends AbstractComparator { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { return a.equals(b) ? 
1.0 : 0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java index d51a1bd..4cfe048 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.tree; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @@ -14,7 +15,7 @@ public class ExactMatchIgnoreCase extends AbstractComparator { } @Override - public double compare(Field a, Field b) { + public double compare(Field a, Field b, final Config conf) { final String fa = getValue(a); final String fb = getValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java index 76f1fd2..0af0a80 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @@ -23,7 +24,7 @@ public class JaroWinkler extends AbstractComparator { } @Override - public double distance(String a, String b) { + public double distance(String a, String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index 85f657f..4c16780 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -4,6 +4,8 @@ import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.config.Config; + import java.util.Map; import java.util.Set; @@ -26,7 +28,7 @@ public class JaroWinklerNormalizedName extends AbstractComparator { } @Override - public double distance(String a, String b) { + public double distance(String a, String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); @@ -36,15 +38,15 @@ public class JaroWinklerNormalizedName extends AbstractComparator { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); - Set keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue()); - Set keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue()); + Set keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); + Set keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); if (sameCity(cities1,cities2)) { - if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) { + if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) { ca = removeKeywords(ca, keywords1); ca = removeKeywords(ca, cities1); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java index 23c4cfa..8556eae 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java @@ -4,6 +4,8 @@ import 
com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.config.Config; + import java.util.Map; //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) @@ -23,7 +25,7 @@ public class JaroWinklerTitle extends AbstractComparator { } @Override - public double distance(String a, String b) { + public double distance(String a, String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java index 6ddd2c9..6a19e66 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.config.Config; import java.util.Map; @@ -22,7 +23,7 @@ public class Level2JaroWinklerTitle extends AbstractComparator { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { final String ca = cleanup(a); final String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java index 0d444e1..546284e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.tree.support.AbstractComparator; import 
eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.config.Config; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -26,7 +28,7 @@ public class LevensteinTitle extends AbstractComparator { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { final String ca = cleanup(a); final String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java index 9413eb5..83b80b1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java @@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.config.Config; + import java.util.Map; @@ -25,7 +27,7 @@ public class LevensteinTitleIgnoreVersion extends AbstractComparator { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java index 4b2e707..01d77b3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.config.Config; import 
java.util.Map; @@ -22,7 +23,7 @@ public class MustBeDifferent extends AbstractComparator { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { return !a.equals(b) ? 1.0 : 0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java index 47d45a1..e65ac71 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.tree; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.Comparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @@ -17,7 +18,7 @@ public class NullDistanceAlgo implements Comparator { } @Override - public double compare(Field a, Field b) { + public double compare(Field a, Field b, Config config) { return 0; } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java index 519c28d..14845da 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.tree; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.adaptor.Pid; @@ -27,7 +28,7 @@ public class PidMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b) { + public double compare(final Field a, final Field b, final Config conf) { final List sa = ((FieldList) a).stringList(); final List sb = ((FieldList) b).stringList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java index e79f918..91f1e35 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java @@ -5,6 +5,7 @@ import java.util.Map; import com.google.common.collect.Iterables; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @@ -28,7 +29,7 @@ public class SizeMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b) { + public double compare(final Field a, final Field b, final Config conf) { if (a.isEmpty() || b.isEmpty()) return -1; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java index 2ea7bd8..004fc90 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java @@ -1,5 +1,8 @@ package eu.dnetlib.pace.tree; +import eu.dnetlib.pace.config.Config; +import org.apache.commons.lang.StringUtils; + import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; @@ -67,9 +70,9 @@ public class SubStringLevenstein extends AbstractComparator { * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) */ @Override - public double compare(final Field a, final Field b) { + public double distance(final Field a, final Field b, final Config conf) { if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) - return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit)); + return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf); 
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java index 6643262..fd86b17 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @@ -21,7 +22,7 @@ public class TitleVersionMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b) { + public double compare(final Field a, final Field b, final Config conf) { final String valueA = getFirstValue(a); final String valueB = getFirstValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java index 225f9ca..8f36126 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.tree; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.ComparatorClass; import org.apache.commons.lang.StringUtils; @@ -28,8 +29,7 @@ public class UrlMatcher extends Levenstein { } @Override - public double compare(Field a, Field b) { - + public double distance(Field a, Field b, final Config conf) { final URL urlA = asUrl(getFirstValue(a)); final URL urlB = asUrl(getFirstValue(b)); @@ -44,7 +44,7 @@ public class UrlMatcher extends Levenstein { return hostW * 0.5; } - return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath()); + 
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf); } private URL asUrl(final String value) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java index fbb0263..f57cba5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.tree; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @@ -22,7 +23,7 @@ public class YearMatch extends AbstractComparator { } @Override - public double compare(final Field a, final Field b) { + public double compare(final Field a, final Field b, final Config conf) { final String valueA = getNumbers(getFirstValue(a)); final String valueB = getNumbers(getFirstValue(b)); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java index f6ad137..1ed14b0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; @@ -66,7 +67,7 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement * the b * @return the double */ - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { if (a.isEmpty() || b.isEmpty()) { return -1; //return -1 if 
a field is missing @@ -84,16 +85,23 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement * the b * @return the double */ - protected double distance(final List a, final List b) { - return distance(concat(a), concat(b)); + protected double distance(final List a, final List b, final Config conf) { + return distance(concat(a), concat(b), conf); + } + + public double distance(final Field a, final Field b, final Config conf) { + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf); + if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf); + + throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); } @Override - public double compare(final Field a, final Field b) { + public double compare(final Field a, final Field b, final Config conf) { if (a.isEmpty() || b.isEmpty()) return -1; - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue()); - if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b)); + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf); + if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf); throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java index 64ff4f3..8524996 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.tree.support; +import 
eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; public interface Comparator { @@ -8,6 +9,6 @@ public interface Comparator { * return : -1 -> can't decide (missing field) * >0 -> similarity degree (depends on the algorithm) * */ - public double compare(Field a, Field b); + public double compare(Field a, Field b, Config conf); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 7294536..1d187b0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.tree.support; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; +import org.codehaus.jackson.annotate.JsonIgnore; import org.codehaus.jackson.map.ObjectMapper; import java.io.IOException; @@ -35,7 +37,7 @@ public class TreeNodeDef implements Serializable { public TreeNodeDef() { } - public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2) { + public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) { TreeNodeStats stats = new TreeNodeStats(); stats.setFieldsCount(fields.size()); @@ -44,7 +46,7 @@ public class TreeNodeDef implements Serializable { double weight = fieldConf.getWeight(); - double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())); + double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); if (result == -1) { //if the field is missing stats.incrementMissCount(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java index 794511a..70e9623 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java @@ -38,7 +38,7 @@ public class TreeProcessor { if (currentNode == null) throw new PaceException("The Tree Node doesn't exist: " + current); - TreeNodeStats stats = currentNode.evaluate(doc1, doc2); + TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) { current = currentNode.getUndefined(); diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv index c74b357..e97fd52 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv @@ -1,4 +1,4 @@ -key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti +key::1;university;universita;universita studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο @@ -38,7 +38,7 @@ 
key::37;federation;federazione;fédération;федерация;federatie;ομο key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο key::39;bureau;ufficio;bureau;офис;bureau;γραφείο key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία -key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology +key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;universidad politecnica;universitat politecnica;politechnika;politechniki;university technology;university science technology key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 84ec090..6eec8cb 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -1,22 +1,24 @@ package eu.dnetlib.pace.clustering; -import java.util.Map; - import com.google.common.collect.Lists; import com.google.common.collect.Maps; import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.config.DedupConfig; import org.junit.Before; import org.junit.Test; +import java.util.Map; + public class ClusteringFunctionTest extends AbstractPaceTest { private Map params; + DedupConfig conf; @Before public void setUp() throws Exception { params = Maps.newHashMap(); + conf = 
DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class)); } @Test @@ -26,7 +28,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "http://www.test.it/path/to/resource"; System.out.println(s); - System.out.println(urlClustering.apply(Lists.newArrayList(url(s)))); + System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s)))); } @Test @@ -40,7 +42,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(ngram.apply(Lists.newArrayList(title(s)))); + System.out.println(ngram.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -52,7 +54,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(np.apply(Lists.newArrayList(title(s)))); + System.out.println(np.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -64,11 +66,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s1 = "University of Pisa"; System.out.println(s1); - System.out.println(np.apply(Lists.newArrayList(title(s1)))); + System.out.println(np.apply(conf, Lists.newArrayList(title(s1)))); final String s2 = "Pisa University"; System.out.println(s2); - System.out.println(np.apply(Lists.newArrayList(title(s2)))); + System.out.println(np.apply(conf, Lists.newArrayList(title(s2)))); } @Test @@ -81,7 +83,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(acro.apply(Lists.newArrayList(title(s)))); + System.out.println(acro.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -93,7 +95,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; 
System.out.println(s); - System.out.println(sp.apply(Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -105,7 +107,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(sp.apply(Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -114,7 +116,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = readFromClasspath("gt.author.json"); System.out.println(s); - System.out.println(cf.apply(Lists.newArrayList(person(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(person(s)))); } @Test @@ -123,27 +125,27 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final ClusteringFunction cf = new KeywordsClustering(params); final String s = "Polytechnic University of Turin"; System.out.println(s); - System.out.println(cf.apply(Lists.newArrayList(title(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); final String s1 = "POLITECNICO DI TORINO"; System.out.println(s1); - System.out.println(cf.apply(Lists.newArrayList(title(s1)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); final String s2 = "Universita farmaceutica culturale di milano bergamo"; System.out.println("s2 = " + s2); - System.out.println(cf.apply(Lists.newArrayList(title(s2)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s2)))); final String s3 = "universita universita milano milano"; System.out.println("s3 = " + s3); - System.out.println(cf.apply(Lists.newArrayList(title(s3)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s3)))); final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)"; System.out.println("s4 = " + s4); - System.out.println(cf.apply(Lists.newArrayList(title(s4)))); + System.out.println(cf.apply(conf, 
Lists.newArrayList(title(s4)))); final String s5 = "İstanbul Ticarət Universiteti"; System.out.println("s5 = " + s5); - System.out.println(cf.apply(Lists.newArrayList(title(s5)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s5)))); } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java index 1004203..c3dc482 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.comparators; import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.tree.JaroWinklerNormalizedName; +import eu.dnetlib.pace.config.DedupConfig; import org.junit.Before; import org.junit.Test; @@ -17,13 +18,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; private Map params; + private DedupConfig conf; @Before public void setup() { - System.out.println("****************************************************************"); - System.out.println("Test String : " + TEST_STRING); params = new HashMap<>(); params.put("weight", 1.0); + conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class)); } @Test @@ -55,7 +56,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { @Test public void testJaroWinklerNormalizedName() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State"); + double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf); System.out.println("result = " + result); assertEquals(0.0, 
result); @@ -65,49 +66,49 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName2() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York"); + double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf); - assertEquals(result, 1.0); + assertEquals(1.0, result); } @Test public void testJaroWinklerNormalizedName3() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna"); + double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf); System.out.println("result = " + result); - assertEquals(result, 0.0); + assertEquals(0.0, result); } @Test public void testJaroWinklerNormalizedName4() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa"); + double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf); System.out.println("result = " + result); - assertEquals(result, 1.0); + assertEquals(1.0, result); } @Test public void testJaroWinklerNormalizedName5() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS"); + double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf); System.out.println("result = " + result); - assertEquals(result, 1.0); + assertEquals(1.0, result); } @Test public 
void testJaroWinklerNormalizedName6() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung"); + double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf); System.out.println("result = " + result); - assertTrue(result> 0.9); + assertTrue(result > 0.9); } @@ -115,17 +116,17 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName7() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO"); + double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf); System.out.println("result = " + result); - assertTrue(result> 0.9); + assertTrue(result > 0.9); } @Test public void testJaroWinklerNormalizedName8() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology"); + double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf); System.out.println("result = " + result); } @@ -134,7 +135,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName9() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Istanbul Commerce 
University", "İstanbul Ticarət Universiteti"); + double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf); System.out.println("result = " + result); } @@ -144,7 +145,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence"); + double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf); System.out.println("result = " + result); } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 575b3c7..9051049 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -5,12 +5,13 @@ import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; public class ConfigTest extends AbstractPaceTest { @Test public void dedupConfigSerializationTest() { - final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json")); + final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("org.curr.conf")); final String conf = cfgFromClasspath.toString(); @@ -37,4 +38,20 @@ public class ConfigTest extends AbstractPaceTest { System.out.println(load.toString()); } + @Test + public void translationMapTest() { + + DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf")); + + System.out.println("translationMap = " + load.getPace().translationMap().toString()); + } + + @Test + public void emptyTranslationMapTest() { + + DedupConfig load = DedupConfig.load(readFromClasspath("org.test.conf")); + + assertEquals(0, 
load.getPace().translationMap().keySet().size()); + } + } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf index a5b28ce..017cbab 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf @@ -5,32 +5,152 @@ "entityType" : "organization", "orderField" : "legalname", "queueMaxSize" : "2000", - "groupMaxSize" : "10", + "groupMaxSize" : "50", "slidingWindowSize" : "200", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true" }, "pace" : { "clustering" : [ +<<<<<<< HEAD { "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, { "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } } +======= + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } +>>>>>>> origin/master ], "sufficientConditions" : [ { "name" : "exactMatch", "fieldsCount" : [ "gridid" ] } ], +<<<<<<< HEAD "necessaryConditions" : [ { "name" : "exactMatch", "fieldsCount" : [ "country" ] }, { "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] } +======= + "conditions" : [ + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, + { "name" : "exactMatch", "fields" : [ "country" ] } +>>>>>>> origin/master ], "model" : [ - { "name" : "legalname", "algo" : "Null", "type" : "String", 
"weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, - { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }, + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, - { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } ], - "blacklists" : { } + "blacklists" : { + "legalname" : [] + }, + "synonyms": { + "key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": 
["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + 
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": 
["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": 
["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"], + "key::53": 
["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"], + "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"], + "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"], + "key::61": ["healthcare","health 
services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"], + "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"], + "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"], + "key::71": 
["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"], + "key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"], + "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"], + "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"], + "key::81": 
["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"], + "key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"], + "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"], + "key::89": 
["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"], + "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"], + "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"], + "key::98": 
["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"], + "key::102": ["informatics","informatica","informática","informática","informatica"], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] + } } } \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf new file mode 100644 index 0000000..3af6585 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf @@ -0,0 +1,40 @@ +{ + "wf" : { + "threshold" : "0.9", + "dedupRun" : "001", + "entityType" : "organization", + "orderField" : "legalname", + "queueMaxSize" : "2000", + "groupMaxSize" : "50", + "slidingWindowSize" : "200", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true" + }, + 
"pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "gridid" ] } + ], + "conditions" : [ + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, + { "name" : "exactMatch", "fields" : [ "country" ] } + ], + "model" : [ + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, + { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } + ], + "blacklists" : { + "legalname" : [] + }, + "synonyms": { + } + } +} \ No newline at end of file diff --git a/pom.xml b/pom.xml index 57ca8d6..4e9d3fe 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.14-SNAPSHOT + 3.0.15-SNAPSHOT pom diff --git a/release.properties b/release.properties deleted file mode 100644 index 5c101a5..0000000 --- a/release.properties +++ /dev/null @@ 
-1,11 +0,0 @@ -#release configuration -#Mon Jul 08 10:03:15 CEST 2019 -scm.tagNameFormat=@{project.artifactId}-@{project.version} -pushChanges=true -scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git -preparationGoals=clean verify -projectVersionPolicyId=default -remoteTagging=true -scm.commentPrefix=[maven-release-plugin] -exec.snapshotReleasePluginAllowed=false -completedPhase=create-backup-poms