From 76efcde4fddadb75b64ce102bb57adaf2ea802d1 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Fri, 13 Dec 2019 12:20:35 +0100 Subject: [PATCH] using new branch decisionTreeDedup --- .../eu/dnetlib/dedup/DedupRecordFactory.java | 5 + .../dnetlib/dedup/SparkCreateDedupTest.java | 4 +- .../eu/dnetlib/dedup/conf/org.curr.conf.json | 1972 ++--------------- .../dnetlib/dedup/conf/pub_dt.curr.conf.json | 334 +++ 4 files changed, 575 insertions(+), 1740 deletions(-) create mode 100644 dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java index 704ef918d..ed3ac8231 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java @@ -221,6 +221,11 @@ public class DedupRecordFactory { throw new RuntimeException(exc); } }); + + if (o.getDataInfo() == null) + { + o.setDataInfo(new DataInfo()); + } o.getDataInfo().setTrust("0.9"); o.setLastupdatetimestamp(ts); diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java index 7aa8a4302..19522e275 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java @@ -16,11 +16,11 @@ import java.util.List; public class SparkCreateDedupTest { String configuration; - String entity = "organization"; + String entity = "publication"; @Before public void setUp() throws IOException { - configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); + configuration = 
IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json")); } diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json index 2aeb4c582..43003c2e0 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json @@ -1,1753 +1,249 @@ { - "wf": { - "threshold": "0.9", - "dedupRun": "001", - "entityType": "organization", - "orderField": "legalname", - "queueMaxSize": "2000", - "groupMaxSize": "50", - "slidingWindowSize": "200", + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "organization", + "orderField" : "legalname", + "queueMaxSize" : "2000", + "groupMaxSize" : "50", + "slidingWindowSize" : "200", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true", "idPath": "$.id", - "rootBuilder": [ - "organization", - "projectOrganization_participation_isParticipant", - "datasourceOrganization_provision_isProvidedBy" - ], - "includeChildren": "true" + "maxIterations": "20" }, - "pace": { - "clustering": [ - { - "name": "sortedngrampairs", - "fields": [ - "legalname" - ], - "params": { - "max": 2, - "ngramLen": "3" - } - }, - { - "name": "suffixprefix", - "fields": [ - "legalname" - ], - "params": { - "max": 1, - "len": "3" - } - }, - { - "name": "urlclustering", - "fields": [ - "websiteurl" - ], - "params": {} - }, - { - "name": "keywordsclustering", - "fields": [ - "legalname" - ], - "params": { - "max": 2, - "windowSize": 4 - } - } + "pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : 
"3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } ], - "strictConditions": [ - { - "name": "exactMatch", + "decisionTree" : { + "start": { "fields": [ - "gridid" - ] - } - ], - "conditions": [ - { - "name": "DomainExactMatch", + { + "field": "gridid", + "comparator": "exactMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1, + "aggregation": "SC", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "layer2", + "ignoreUndefined": "false" + }, + "layer2": { "fields": [ - "websiteurl" - ] + { + "field": "websiteurl", + "comparator": "domainExactMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + }, + { + "field": "country", + "comparator": "exactMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 1, + "aggregation": "NC", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "true" }, - { - "name": "exactMatch", + "layer3": { "fields": [ - "country" - ] + { + "field": "legalname", + "comparator": "cityMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + "windowSize": "4", + "threshold": "0.0" + } + } + ], + "threshold": 1.0, + "aggregation": "W_MEAN", + "positive": "layer4", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + }, + "layer4": { + "fields": [ + { + "field": "legalname", + "comparator": "keywordMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "windowSize": "4", + "threshold": "0.7" + } + } + ], + "threshold": 1.0, + "aggregation": "W_MEAN", + "positive": "layer5", + "negative": "NO_MATCH", + "undefined": "layer5", + "ignoreUndefined": "false" + }, + "layer5": { + "fields": [ + { + "field": "legalname", + "comparator": "jaroWinklerNormalizedName", + "weight": 0.9, + "countIfUndefined": "true", 
+ "params": { + "windowSize": "4" + } + }, + { + "field": "legalshortname", + "comparator": "jaroWinklerNormalizedName", + "weight": 0.1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 0.9, + "aggregation": "W_MEAN", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" } + }, + "model" : [ + { "name" : "country", "type" : "String", "path" : "$.country.classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, + { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"} ], - "model": [ - { - "name": "country", - "algo": "Null", - "type": "String", - "weight": "0", - "ignoreMissing": "false", - "path": "$.country.classid" - }, - { - "name": "legalshortname", - "algo": "JaroWinklerNormalizedName", - "type": "String", - "weight": "0.1", - "ignoreMissing": "true", - "path": "$.legalshortname.value" - }, - { - "name": "legalname", - "algo": "JaroWinklerNormalizedName", - "type": "String", - "weight": "0.9", - "ignoreMissing": "false", - "path": "$.legalname.value", - "params": { - "windowSize": 4, - "threshold": 0.7 - } - }, - { - "name": "websiteurl", - "algo": "Null", - "type": "URL", - "weight": "0", - "ignoreMissing": "true", - "path": "$.websiteurl.value", - "params": { - "host": 0.5, - "path": 0.5 - } - }, - { - "name": "gridid", - "algo": "Null", - "type": "String", - "weight": "0.0", - "ignoreMissing": "true", - "path": "$.pid[?(@.qualifier.classid ==\"grid\")].value" - } - ], - "blacklists": { - "legalname": [] + "blacklists" : { + "legalname" : [] }, "synonyms": { - "key::1": [ - "university", - "università", - "università studi", - "universitario", - "universitaria", - "université", - "universitaire", - "universitaires", - "universidad", - "universitade", - 
"Universität", - "universitaet", - "Uniwersytet", - "университет", - "universiteit", - "πανεπιστήμιο", - "universitesi", - "universiteti" - ], - "key::2": [ - "studies", - "studi", - "études", - "estudios", - "estudos", - "Studien", - "studia", - "исследования", - "studies", - "σπουδές" - ], - "key::3": [ - "advanced", - "superiore", - "supérieur", - "supérieure", - "supérieurs", - "supérieures", - "avancado", - "avancados", - "fortgeschrittene", - "fortgeschritten", - "zaawansowany", - "передовой", - "gevorderd", - "gevorderde", - "προχωρημένος", - "προχωρημένη", - "προχωρημένο", - "προχωρημένες", - "προχωρημένα", - "wyzsza" - ], - "key::4": [ - "institute", - "istituto", - "institut", - "instituto", - "instituto", - "Institut", - "instytut", - "институт", - "instituut", - "ινστιτούτο" - ], - "key::5": [ - "hospital", - "ospedale", - "hôpital", - "hospital", - "hospital", - "Krankenhaus", - "szpital", - "больница", - "ziekenhuis", - "νοσοκομείο" - ], - "key::6": [ - "research", - "ricerca", - "recherche", - "investigacion", - "pesquisa", - "Forschung", - "badania", - "исследования", - "onderzoek", - "έρευνα", - "erevna", - "erevnas" - ], - "key::7": [ - "college", - "collegio", - "université", - "colegio", - "faculdade", - "Hochschule", - "Szkoła Wyższa", - "Высшая школа", - "universiteit", - "κολλέγιο" - ], - "key::8": [ - "foundation", - "fondazione", - "fondation", - "fundación", - "fundação", - "Stiftung", - "Fundacja", - "фонд", - "stichting", - "ίδρυμα", - "idryma" - ], - "key::9": [ - "center", - "centro", - "centre", - "centro", - "centro", - "zentrum", - "centrum", - "центр", - "centrum", - "κέντρο" - ], - "key::10": [ - "national", - "nazionale", - "national", - "nationale", - "nationaux", - "nationales", - "nacional", - "nacional", - "national", - "krajowy", - "национальный", - "nationaal", - "nationale", - "εθνικό" - ], - "key::11": [ - "association", - "associazione", - "association", - "asociación", - "associação", - "Verein", - "verband", - 
"stowarzyszenie", - "ассоциация", - "associatie" - ], - "key::12": [ - "society", - "societa", - "société", - "sociedad", - "sociedade", - "gesellschaft", - "społeczeństwo", - "общество", - "maatschappij", - "κοινωνία" - ], - "key::13": [ - "international", - "internazionale", - "international", - "internacional", - "internacional", - "international", - "międzynarodowy", - "Международный", - "internationaal", - "internationale", - "διεθνής", - "διεθνή", - "διεθνές" - ], - "key::14": [ - "community", - "comunita", - "communauté", - "comunidad", - "comunidade", - "Gemeinschaft", - "społeczność", - "сообщество", - "gemeenschap", - "κοινότητα" - ], - "key::15": [ - "school", - "scuola", - "école", - "escuela", - "escola", - "schule", - "Szkoła", - "школа", - "school", - "σχολείο" - ], - "key::16": [ - "education", - "educazione", - "éducation", - "educacion", - "Educação", - "Bildung", - "Edukacja", - "образование", - "opleiding", - "εκπαίδευση" - ], - "key::17": [ - "academy", - "accademia", - "académie", - "academia", - "academia", - "Akademie", - "akademie", - "академия", - "academie", - "ακαδημία" - ], - "key::18": [ - "public", - "pubblico", - "public", - "publique", - "publics", - "publiques", - "publico", - "publico", - "Öffentlichkeit", - "publiczny", - "публичный", - "publiek", - "publieke", - "δημόσιος", - "δημόσια", - "δημόσιο" - ], - "key::19": [ - "museum", - "museo", - "musée", - "mueso", - "museu", - "museum", - "muzeum", - "музей", - "museum", - "μουσείο" - ], - "key::20": [ - "group", - "gruppo", - "groupe", - "grupo", - "grupo", - "gruppe", - "grupa", - "группа", - "groep", - "ομάδα", - "όμιλος" - ], - "key::21": [ - "department", - "dipartimento", - "département", - "departamento", - "departamento", - "abteilung", - "departament", - "отдел", - "afdeling", - "τμήμα" - ], - "key::22": [ - "council", - "consiglio", - "conseil", - "Consejo", - "conselho", - "gesellschaft", - "rada", - "совет", - "raad", - "συμβούλιο" - ], - "key::23": [ - "library", - 
"biblioteca", - "bibliothèque", - "biblioteca", - "biblioteca", - "Bibliothek", - "biblioteka", - "библиотека", - "bibliotheek", - "βιβλιοθήκη" - ], - "key::24": [ - "ministry", - "ministero", - "ministère", - "ministerio", - "ministério", - "Ministerium", - "ministerstwo", - "министерство", - "ministerie", - "υπουργείο" - ], - "key::25": [ - "services", - "servizi", - "services", - "servicios", - "Serviços", - "Dienstleistungen", - "usługi", - "услуги", - "diensten", - "υπηρεσίες" - ], - "key::26": [ - "central", - "centrale", - "central", - "centrale", - "centrales", - "central", - "central", - "zentral", - "centralny", - "цетральный", - "centraal", - "κεντρικός", - "κεντρική", - "κεντρικό", - "κεντρικά" - ], - "key::27": [ - "general", - "generale", - "général", - "générale", - "généraux", - "générales", - "general", - "geral", - "general", - "Allgemeines", - "general", - "общий", - "algemeen", - "algemene", - "γενικός", - "γενική", - "γενικό", - "γενικά" - ], - "key::28": [ - "applied", - "applicati", - "appliqué", - "appliquée", - "appliqués", - "appliquées", - "aplicado", - "aplicada", - "angewendet", - "stosowany", - "прикладной", - "toegepast", - "toegepaste", - "εφαρμοσμένος", - "εφαρμοσμένη", - "εφαρμοσμένο", - "εφαρμοσμένα" - ], - "key::29": [ - "european", - "europee", - "europea", - "européen", - "européenne", - "européens", - "européennes", - "europeo", - "europeu", - "europäisch", - "europejski", - "европейский", - "Europees", - "Europese", - "ευρωπαϊκός", - "ευρωπαϊκή", - "ευρωπαϊκό", - "ευρωπαϊκά" - ], - "key::30": [ - "agency", - "agenzia", - "agence", - "agencia", - "agencia", - "agentur", - "agencja", - "агенция", - "agentschap", - "πρακτορείο" - ], - "key::31": [ - "laboratory", - "laboratorio", - "laboratoire", - "laboratorio", - "laboratorio", - "labor", - "laboratorium", - "лаборатория", - "laboratorium", - "εργαστήριο" - ], - "key::32": [ - "industry", - "industria", - "industrie", - "индустрия", - "industrie", - "βιομηχανία" - ], - 
"key::33": [ - "industrial", - "industriale", - "industriel", - "industrielle", - "industriels", - "industrielles", - "индустриальный", - "industrieel", - "βιομηχανικός", - "βιομηχανική", - "βιομηχανικό", - "βιομηχανικά", - "βιομηχανικές" - ], - "key::34": [ - "consortium", - "consorzio", - "consortium", - "консорциум", - "consortium", - "κοινοπραξία" - ], - "key::35": [ - "organization", - "organizzazione", - "organisation", - "organización", - "organização", - "organizacja", - "организация", - "organisatie", - "οργανισμός" - ], - "key::36": [ - "authority", - "autorità", - "autorité", - "авторитет", - "autoriteit" - ], - "key::37": [ - "federation", - "federazione", - "fédération", - "федерация", - "federatie", - "ομοσπονδία" - ], - "key::38": [ - "observatory", - "osservatorio", - "observatoire", - "обсерватория", - "observatorium", - "αστεροσκοπείο" - ], - "key::39": [ - "bureau", - "ufficio", - "bureau", - "офис", - "bureau", - "γραφείο" - ], - "key::40": [ - "company", - "impresa", - "compagnie", - "société", - "компания", - "bedrijf", - "εταιρία" - ], - "key::41": [ - "polytechnic", - "politecnico", - "polytechnique", - "политехника", - "polytechnisch", - "πολυτεχνείο", - "universita politecnica", - "polytechnic university", - "universidad politecnica", - "universitat politecnica", - "politechnika", - "politechniki", - "university technology", - "university science technology" - ], - "key::42": [ - "coalition", - "coalizione", - "coalition", - "коалиция", - "coalitie", - "συνασπισμός" - ], - "key::43": [ - "initiative", - "iniziativa", - "initiative", - "инициатива", - "initiatief", - "πρωτοβουλία" - ], - "key::44": [ - "academic", - "accademico", - "académique", - "universitaire", - "акадеческий academisch", - "ακαδημαϊκός", - "ακαδημαϊκή", - "ακαδημαϊκό", - "ακαδημαϊκές", - "ακαδημαϊκοί" - ], - "key::45": [ - "institution", - "istituzione", - "institution", - "институциональный", - "instelling", - "ινστιτούτο" - ], - "key::46": [ - "division", - 
"divisione", - "division", - "отделение", - "divisie", - "τμήμα" - ], - "key::47": [ - "committee", - "comitato", - "comité", - "комитет", - "commissie", - "επιτροπή" - ], - "key::48": [ - "promotion", - "promozione", - "продвижение", - "proothisis", - "forderung" - ], - "key::49": [ - "medical", - "medicine", - "clinical", - "medicina", - "clinici", - "médico", - "medicina", - "clínica", - "médico", - "medicina", - "clínica", - "medizinisch", - "Medizin", - "klinisch", - "medisch", - "geneeskunde", - "klinisch", - "ιατρικός", - "ιατρική", - "ιατρικό", - "ιατρικά", - "κλινικός", - "κλινική", - "κλινικό", - "κλινικά", - "tıbbi", - "tıp", - "klinik", - "orvosi", - "orvostudomány", - "klinikai", - "zdravniški", - "medicinski", - "klinični", - "meditsiini", - "kliinik", - "kliiniline" - ], - "key::50": [ - "technology", - "technological", - "tecnologia", - "tecnologie", - "tecnología", - "tecnológico", - "tecnologia", - "tecnológico", - "Technologie", - "technologisch", - "technologie", - "technologisch", - "τεχνολογία", - "τεχνολογικός", - "τεχνολογική", - "τεχνολογικό", - "teknoloji", - "teknolojik", - "technológia", - "technológiai", - "tehnologija", - "tehnološki", - "tehnoloogia", - "tehnoloogiline", - "technologii", - "technical", - "texniki", - "teknik" - ], - "key::51": [ - "science", - "scientific", - "scienza", - "scientifiche", - "scienze", - "ciencia", - "científico", - "ciência", - "científico", - "Wissenschaft", - "wissenschaftlich", - "wetenschap", - "wetenschappelijk", - "επιστήμη", - "επιστημονικός", - "επιστημονική", - "επιστημονικό", - "επιστημονικά", - "bilim", - "bilimsel", - "tudomány", - "tudományos", - "znanost", - "znanstveni", - "teadus", - "teaduslik", - "" - ], - "key::52": [ - "engineering", - "ingegneria", - "ingeniería", - "engenharia", - "Ingenieurwissenschaft", - "ingenieurswetenschappen", - "bouwkunde", - "μηχανικός", - "μηχανική", - "μηχανικό", - "mühendislik", - "mérnöki", - "Inženirstvo", - "inseneeria", - "inseneri", - "" - ], - 
"key::53": [ - "management", - "gestione", - "gestionale", - "gestionali", - "gestión", - "administración", - "gestão", - "administração", - "Verwaltung", - "management", - "διαχείριση", - "yönetim", - "menedzsment", - "vodstvo", - "upravljanje", - "management", - "juhtkond", - "juhtimine", - "haldus", - "" - ], - "key::54": [ - "energy", - "energia", - "energía", - "energia", - "Energie", - "energie", - "ενέργεια", - "enerji", - "energia", - "energija", - "energia", - "" - ], - "key::55": [ - "agricultural", - "agriculture", - "agricoltura", - "agricole", - "agrícola", - "agricultura", - "agrícola", - "agricultura", - "landwirtschaftlich", - "Landwirtschaft", - "landbouwkundig", - "landbouw", - "αγροτικός", - "αγροτική", - "αγροτικό", - "γεωργικός", - "γεωργική", - "γεωργικό", - "γεωργία", - "tarımsal", - "tarım", - "mezőgazdasági", - "mezőgazdaság", - "poljedelski", - "poljedelstvo", - "põllumajandus", - "põllumajanduslik", - "" - ], - "key::56": [ - "information", - "informazione", - "información", - "informação", - "Information", - "informatie", - "πληροφορία", - "bilgi", - "információ", - "informacija", - "informatsioon", - "informatycznych", - "" - ], - "key::57": [ - "social", - "sociali", - "social", - "social", - "Sozial", - "sociaal", - "maatschappelijk", - "κοινωνικός", - "κοινωνική", - "κοινωνικό", - "κοινωνικά", - "sosyal", - "szociális", - "družbeni", - "sotsiaal", - "sotsiaalne", - "" - ], - "key::58": [ - "environmental", - "ambiente", - "medioambiental", - "ambiente", - "medioambiente", - "meioambiente", - "Umwelt", - "milieu", - "milieuwetenschap", - "milieukunde", - "περιβαλλοντικός", - "περιβαλλοντική", - "περιβαλλοντικό", - "περιβαλλοντικά", - "çevre", - "környezeti", - "okoliški", - "keskonna", - "" - ], - "key::59": [ - "business", - "economia", - "economiche", - "economica", - "negocio", - "empresa", - "negócio", - "Unternehmen", - "bedrijf", - "bedrijfskunde", - "επιχείρηση", - "iş", - "üzleti", - "posel", - "ettevõte/äri", - "" - ], - 
"key::60": [ - "pharmaceuticals", - "pharmacy", - "farmacia", - "farmaceutica", - "farmacéutica", - "farmacia", - "farmacêutica", - "farmácia", - "Pharmazeutika", - "Arzneimittelkunde", - "farmaceutica", - "geneesmiddelen", - "apotheek", - "φαρμακευτικός", - "φαρμακευτική", - "φαρμακευτικό", - "φαρμακευτικά", - "φαρμακείο", - "ilaç", - "eczane", - "gyógyszerészeti", - "gyógyszertár", - "farmacevtika", - "lekarništvo", - "farmaatsia", - "farmatseutiline", - "" - ], - "key::61": [ - "healthcare", - "health services", - "salute", - "atenciónmédica", - "cuidadodelasalud", - "cuidadoscomasaúde", - "Gesundheitswesen", - "gezondheidszorg", - "ιατροφαρμακευτικήπερίθαλψη", - "sağlıkhizmeti", - "egészségügy", - "zdravstvo", - "tervishoid", - "tervishoiu", - "" - ], - "key::62": [ - "history", - "storia", - "historia", - "história", - "Geschichte", - "geschiedenis", - "geschiedkunde", - "ιστορία", - "tarih", - "történelem", - "zgodovina", - "ajalugu", - "" - ], - "key::63": [ - "materials", - "materiali", - "materia", - "materiales", - "materiais", - "materialen", - "υλικά", - "τεκμήρια", - "malzemeler", - "anyagok", - "materiali", - "materjalid", - "vahendid", - "" - ], - "key::64": [ - "economics", - "economia", - "economiche", - "economica", - "economía", - "economia", - "Wirtschaft", - "economie", - "οικονομικά", - "οικονομικέςεπιστήμες", - "ekonomi", - "közgazdaságtan", - "gospodarstvo", - "ekonomija", - "majanduslik", - "majandus", - "" - ], - "key::65": [ - "therapeutics", - "terapeutica", - "terapéutica", - "terapêutica", - "therapie", - "θεραπευτική", - "tedavibilimi", - "gyógykezelés", - "terapevtika", - "terapeutiline", - "ravi", - "" - ], - "key::66": [ - "oncology", - "oncologia", - "oncologico", - "oncología", - "oncologia", - "Onkologie", - "oncologie", - "ογκολογία", - "onkoloji", - "onkológia", - "onkologija", - "onkoloogia", - "" - ], - "key::67": [ - "natural", - "naturali", - "naturale", - "natural", - "natural", - "natürlich", - "natuurlijk", - "φυσικός", 
- "φυσική", - "φυσικό", - "φυσικά", - "doğal", - "természetes", - "naraven", - "loodus", - "" - ], - "key::68": [ - "educational", - "educazione", - "pedagogia", - "educacional", - "educativo", - "educacional", - "pädagogisch", - "educatief", - "εκπαιδευτικός", - "εκπαιδευτική", - "εκπαιδευτικό", - "εκπαιδευτικά", - "eğitimsel", - "oktatási", - "izobraževalen", - "haridus", - "hariduslik", - "" - ], - "key::69": [ - "biomedical", - "biomedica", - "biomédico", - "biomédico", - "biomedizinisch", - "biomedisch", - "βιοιατρικός", - "βιοιατρική", - "βιοιατρικό", - "βιοιατρικά", - "biyomedikal", - "orvosbiológiai", - "biomedicinski", - "biomeditsiiniline", - "" - ], - "key::70": [ - "veterinary", - "veterinaria", - "veterinarie", - "veterinaria", - "veterinária", - "tierärtzlich", - "veterinair", - "veeartsenijlkunde", - "κτηνιατρικός", - "κτηνιατρική", - "κτηνιατρικό", - "κτηνιατρικά", - "veteriner", - "állatorvosi", - "veterinar", - "veterinarski", - "veterinaaria", - "" - ], - "key::71": [ - "chemistry", - "chimica", - "química", - "química", - "Chemie", - "chemie", - "scheikunde", - "χημεία", - "kimya", - "kémia", - "kemija", - "keemia", - "" - ], - "key::72": [ - "security", - "sicurezza", - "seguridad", - "segurança", - "Sicherheit", - "veiligheid", - "ασφάλεια", - "güvenlik", - "biztonsági", - "varnost", - "turvalisus", - "julgeolek", - "" - ], - "key::73": [ - "biotechnology", - "biotecnologia", - "biotecnologie", - "biotecnología", - "biotecnologia", - "Biotechnologie", - "biotechnologie", - "βιοτεχνολογία", - "biyoteknoloji", - "biotechnológia", - "biotehnologija", - "biotehnoloogia", - "" - ], - "key::74": [ - "military", - "militare", - "militari", - "militar", - "militar", - "Militär", - "militair", - "leger", - "στρατιωτικός", - "στρατιωτική", - "στρατιωτικό", - "στρατιωτικά", - "askeri", - "katonai", - "vojaški", - "vojni", - "militaar", - "wojskowa", - "" - ], - "key::75": [ - "theological", - "teologia", - "teologico", - "teológico", - "tecnológica", - 
"theologisch", - "theologisch", - "θεολογικός", - "θεολογική", - "θεολογικό", - "θεολογικά", - "teolojik", - "technológiai", - "teološki", - "teoloogia", - "usuteadus", - "teoloogiline", - "" - ], - "key::76": [ - "electronics", - "elettronica", - "electrónica", - "eletrônicos", - "Elektronik", - "elektronica", - "ηλεκτρονική", - "elektronik", - "elektronika", - "elektronika", - "elektroonika", - "" - ], - "key::77": [ - "forestry", - "forestale", - "forestali", - "silvicultura", - "forestal", - "floresta", - "Forstwirtschaft", - "bosbouw", - "δασοκομία", - "δασολογία", - "ormancılık", - "erdészet", - "gozdarstvo", - "metsandus", - "" - ], - "key::78": [ - "maritime", - "marittima", - "marittime", - "marittimo", - "marítimo", - "marítimo", - "maritiem", - "ναυτικός", - "ναυτική", - "ναυτικό", - "ναυτικά", - "ναυτιλιακός", - "ναυτιλιακή", - "ναυτιλιακό", - "ναυτιλιακά", - "θαλάσσιος", - "θαλάσσια", - "θαλάσσιο", - "denizcilik", - "tengeri", - "morski", - "mere", - "merendus", - "" - ], - "key::79": [ - "sports", - "sport", - "deportes", - "esportes", - "Sport", - "sport", - "sportwetenschappen", - "άθληση", - "γυμναστικήδραστηριότητα", - "spor", - "sport", - "šport", - "sport", - "spordi", - "" - ], - "key::80": [ - "surgery", - "chirurgia", - "chirurgiche", - "cirugía", - "cirurgia", - "Chirurgie", - "chirurgie", - "heelkunde", - "εγχείρηση", - "επέμβαση", - "χειρουργικήεπέμβαση", - "cerrahi", - "sebészet", - "kirurgija", - "kirurgia", - "" - ], - "key::81": [ - "cultural", - "culturale", - "culturali", - "cultura", - "cultural", - "cultural", - "kulturell", - "cultureel", - "πολιτιστικός", - "πολιτιστική", - "πολιτιστικό", - "πολιτισμικός", - "πολιτισμική", - "πολιτισμικό", - "kültürel", - "kultúrális", - "kulturni", - "kultuuri", - "kultuuriline", - "" - ], - "key::82": [ - "computerscience", - "informatica", - "ordenador", - "computadora", - "informática", - "computación", - "cienciasdelacomputación", - "ciênciadacomputação", - "Computer", - "computer", - 
"υπολογιστής", - "ηλεκτρονικόςυπολογιστής", - "bilgisayar", - "számítógép", - "računalnik", - "arvuti", - "" - ], - "key::83": [ - "finance", - "financial", - "finanza", - "finanziarie", - "finanza", - "financiero", - "finanças", - "financeiro", - "Finanzen", - "finanziell", - "financiën", - "financieel", - "χρηματοοικονομικά", - "χρηματοδότηση", - "finanse", - "finansal", - "pénzügy", - "pénzügyi", - "finance", - "finančni", - "finants", - "finantsiline", - "" - ], - "key::84": [ - "communication", - "comunicazione", - "comuniciación", - "comunicação", - "Kommunikation", - "communication", - "επικοινωνία", - "iletişim", - "kommunikáció", - "komuniciranje", - "kommunikatsioon", - "" - ], - "key::85": [ - "justice", - "giustizia", - "justicia", - "justiça", - "Recht", - "Justiz", - "justitie", - "gerechtigheid", - "δικαιοσύνη", - "υπουργείοδικαιοσύνης", - "δίκαιο", - "adalet", - "igazságügy", - "pravo", - "õigus", - "" - ], - "key::86": [ - "aerospace", - "aerospaziale", - "aerospaziali", - "aeroespacio", - "aeroespaço", - "Luftfahrt", - "luchtvaart", - "ruimtevaart", - "αεροπορικός", - "αεροπορική", - "αεροπορικό", - "αεροναυπηγικός", - "αεροναυπηγική", - "αεροναυπηγικό", - "αεροναυπηγικά", - "havacılıkveuzay", - "légtér", - "zrakoplovstvo", - "atmosfäär", - "kosmos", - "" - ], - "key::87": [ - "dermatology", - "dermatologia", - "dermatología", - "dermatologia", - "Dermatologie", - "dermatologie", - "δρματολογία", - "dermatoloji", - "bőrgyógyászat", - "dermatológia", - "dermatologija", - "dermatoloogia", - "" - ], - "key::88": [ - "architecture", - "architettura", - "arquitectura", - "arquitetura", - "Architektur", - "architectuur", - "αρχιτεκτονική", - "mimarlık", - "építészet", - "arhitektura", - "arhitektuur", - "" - ], - "key::89": [ - "mathematics", - "matematica", - "matematiche", - "matemáticas", - "matemáticas", - "Mathematik", - "wiskunde", - "mathematica", - "μαθηματικά", - "matematik", - "matematika", - "matematika", - "matemaatika", - "" - ], - 
"key::90": [ - "language", - "lingue", - "linguistica", - "linguistiche", - "lenguaje", - "idioma", - "língua", - "idioma", - "Sprache", - "taal", - "taalkunde", - "γλώσσα", - "dil", - "nyelv", - "jezik", - "keel", - "" - ], - "key::91": [ - "neuroscience", - "neuroscienza", - "neurociencia", - "neurociência", - "Neurowissenschaft", - "neurowetenschappen", - "νευροεπιστήμη", - "nörobilim", - "idegtudomány", - "nevroznanost", - "neuroteadused", - "" - ], - "key::92": [ - "automation", - "automazione", - "automatización", - "automação", - "Automatisierung", - "automatisering", - "αυτοματοποίηση", - "otomasyon", - "automatizálás", - "avtomatizacija", - "automatiseeritud", - "" - ], - "key::93": [ - "pediatric", - "pediatria", - "pediatriche", - "pediatrico", - "pediátrico", - "pediatría", - "pediátrico", - "pediatria", - "pädiatrisch", - "pediatrische", - "παιδιατρική", - "pediatrik", - "gyermekgyógyászat", - "pediatrija", - "pediaatria", - "" - ], - "key::94": [ - "photonics", - "fotonica", - "fotoniche", - "fotónica", - "fotônica", - "Photonik", - "fotonica", - "φωτονική", - "fotonik", - "fotonika", - "fotonika", - "fotoonika", - "" - ], - "key::95": [ - "mechanics", - "meccanica", - "meccaniche", - "mecánica", - "mecânica", - "Mechanik", - "Maschinenbau", - "mechanica", - "werktuigkunde", - "μηχανικής", - "mekanik", - "gépészet", - "mehanika", - "mehaanika", - "" - ], - "key::96": [ - "psychiatrics", - "psichiatria", - "psichiatrica", - "psichiatriche", - "psiquiatría", - "psiquiatria", - "Psychiatrie", - "psychiatrie", - "ψυχιατρική", - "psikiyatrik", - "pszihiátria", - "psihiatrija", - "psühhaatria", - "" - ], - "key::97": [ - "psychology", - "fisiologia", - "psicología", - "psicologia", - "Psychologie", - "psychologie", - "ψυχολογία", - "psikoloji", - "pszihológia", - "psihologija", - "psühholoogia", - "" - ], - "key::98": [ - "automotive", - "industriaautomobilistica", - "industriadelautomóvil", - "automotriz", - "industriaautomotriz", - "automotivo", - 
"Automobilindustrie", - "autoindustrie", - "αυτοκίνητος", - "αυτοκίνητη", - "αυτοκίνητο", - "αυτοκινούμενος", - "αυτοκινούμενη", - "αυτοκινούμενο", - "αυτοκινητιστικός", - "αυτοκινητιστική", - "αυτοκινητιστικό", - "otomotiv", - "autóipari", - "samogiben", - "avtomobilskaindustrija", - "auto-", - "" - ], - "key::99": [ - "neurology", - "neurologia", - "neurologiche", - "neurología", - "neurologia", - "Neurologie", - "neurologie", - "zenuwleer", - "νευρολογία", - "nöroloji", - "neurológia", - "ideggyógyászat", - "nevrologija", - "neuroloogia", - "" - ], - "key::100": [ - "geology", - "geologia", - "geologiche", - "geología", - "geologia", - "Geologie", - "geologie", - "aardkunde", - "γεωλογία", - "jeoloji", - "geológia", - "földtudomány", - "geologija", - "geoloogia", - "" - ], - "key::101": [ - "microbiology", - "microbiologia", - "micro-biologia", - "microbiologiche", - "microbiología", - "microbiologia", - "Mikrobiologie", - "microbiologie", - "μικροβιολογία", - "mikrobiyoloji", - "mikrobiológia", - "mikrobiologija", - "mikrobioloogia", - "" - ], - "key::102": [ - "informatics", - "informatica", - "informática", - "informática", - "informatica", - "" - ], - "key::103": [ - "forschungsgemeinschaft", - "comunita ricerca", - "research community", - "research foundation", - "research association" - ], - "key::104": [ - "commerce", - "ticaret", - "ticarət", - "commercio", - "trade", - "handel", - "comercio" - ] + "key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": 
["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + 
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","museo","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": 
["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": 
["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""], + "key::53": 
["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""], + "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""], + "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""], + "key::61": 
["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""], + "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""], + "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""], + "key::70": 
["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärztlich","veterinair","veeartsenijkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""], + "key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline",""], + "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""], + "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""], + "key::79": 
["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""], + "key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""], + "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""], + "key::87": 
["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], + "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszichiátria","psihiatrija","psühhaatria",""], + "key::97": 
["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszichológia","psihologija","psühholoogia",""], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""], + "key::102": ["informatics","informatica","informática","informática","informatica",""], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], + "key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"] } } } \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json new file mode 100644 index 000000000..568b0e962 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json @@ -0,0 +1,334 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + 
"entityType" : "result", + "subEntityType" : "resulttype", + "subEntityValue" : "publication", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "100", + "maxChildren" : "100", + "idPath": "$.id", + "slidingWindowSize" : "200", + "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } + ], + + "decisionTree" : { + "start": { + "fields": [ + { + "field": "pid", + "comparator": "pidMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1, + "aggregation": "SC", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "layer2", + "ignoreUndefined": "false" + }, + "layer2": { + "fields": [ + { + "field": "title", + "comparator": "titleVersionMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} + }, + { + "field": "authors", + "comparator": "sizeMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1, + "aggregation": "NC", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "false" + }, + "layer3": { + "fields": [ + { + "field": "title", + "comparator": "LevensteinTitle", + "weight": 1, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.99, + "aggregation": "W_MEAN", + 
"positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "false" + } + }, + "model" : [ + { "name" : "doi", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"doi\")].value" }, + { "name" : "pid", "type" : "JSON","path" : "$.pid", "overrideMatch" : "true" }, + { "name" : "title", "type" : "String", "path" : "$.title[?(@.qualifier.classid ==\"main title\")].value", "length" : 250, "size" : 5 }, + { "name" : "authors", "type" : "List", "path" : "$.author[*].fullname", "size" : 200 }, + { "name" : "resulttype", "type" : "String", "path" : "$.resulttype.classid" } + ], + "synonyms": {}, + "blacklists" : { + "title" : [ + "^Inside Front Cover$", + "(?i)^Poster presentations$", + "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", + "^Problems with perinatal pathology\\.?$", + "(?i)^Cases? of Puerperal Convulsions$", + "(?i)^Operative Gyna?ecology$", + "(?i)^Mind the gap\\!?\\:?$", + "^Chronic fatigue syndrome\\.?$", + "^Cartas? ao editor Letters? to the Editor$", + "^Note from the Editor$", + "^Anesthesia Abstract$", + + "^Annual report$", + "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", + "(?i)^Graph and Table of Infectious Diseases?$", + "^Presentation$", + "(?i)^Reviews and Information on Publications$", + "(?i)^PUBLIC HEALTH SERVICES?$", + "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", + "(?i)^Adrese autora$", + "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", + "(?i)^Acknowledgement to Referees$", + "(?i)^Behçet's disease\\.?$", + "(?i)^Isolation and identification of restriction endonuclease.*$", + "(?i)^CEREBROVASCULAR DISEASES?.?$", + "(?i)^Screening for abdominal aortic aneurysms?\\.?$", + "^Event management$", + "(?i)^Breakfast and Crohn's disease.*\\.?$", + "^Cálculo de concentraciones en disoluciones acuosas. 
Ejercicio interactivo\\..*\\.$", + "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", + "^Gushi hakubutsugaku$", + + "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", + "^Intestinal spirocha?etosis$", + "^Treatment of Rodent Ulcer$", + "(?i)^\\W*Cloud Computing\\W*$", + "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", + "^Free Communications, Poster Presentations: Session [A-F]$", + + "^“The Historical Aspects? of Quackery\\.?”$", + "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", + "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", + "(?i)^Case Report$", + "^Boletín Informativo$", + "(?i)^Glioblastoma Multiforme$", + "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", + "^Zaměstnanecké výhody$", + "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", + "(?i)^Carotid body tumours?\\.?$", + "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", + "^Avant-propos$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", + "(?i)^PUBLIC HEALTH VERSUS THE STATE$", + "^Viñetas de Cortázar$", + "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? 
bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", + "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", + "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", + "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", + + "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", + "^Aus der AGMB$", + + "^Znanstveno-stručni prilozi$", + "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", + "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", + "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", + "^Finanční analýza podniku$", + "^Financial analysis( of business)?$", + "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", + "^Jikken nihon shūshinsho$", + "(?i)^CORONER('|s)(s|') INQUESTS$", + "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", + "(?i)^Consultants' contract(s)?$", + "(?i)^Upute autorima$", + "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", + "^Joshi shin kokubun$", + "^Kōtō shōgaku dokuhon nōson'yō$", + "^Jinjō shōgaku shōka$", + "^Shōgaku shūjichō$", + "^Nihon joshi dokuhon$", + "^Joshi shin dokuhon$", + "^Chūtō kanbun dokuhon$", + "^Wabun dokuhon$", + "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", + "(?i)^cardiac rehabilitation$", + "(?i)^Analytical summary$", + "^Thesaurus resolutionum Sacrae Congregationis Concilii$", + "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", + "^Prikazi i osvrti$", + "^Rodinný dům s provozovnou$", + "^Family house with an establishment$", + "^Shinsei chūtō shin kokugun$", + "^Pulmonary alveolar 
proteinosis(\\.?)$", + "^Shinshū kanbun$", + "^Viñeta(s?) de Rodríguez$", + "(?i)^RUBRIKA UREDNIKA$", + "^A Matching Model of the Academic Publication Market$", + "^Yōgaku kōyō$", + + "^Internetový marketing$", + "^Internet marketing$", + "^Chūtō kokugo dokuhon$", + "^Kokugo dokuhon$", + "^Antibiotic Cover for Dental Extraction(s?)$", + "^Strategie podniku$", + "^Strategy of an Enterprise$", + "(?i)^respiratory disease(s?)(\\.?)$", + "^Award(s?) for Gallantry in Civil Defence$", + "^Podniková kultura$", + "^Corporate Culture$", + "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", + "^Pracovní motivace$", + "^Work Motivation$", + "^Kaitei kōtō jogaku dokuhon$", + "^Konsolidovaná účetní závěrka$", + "^Consolidated Financial Statements$", + "(?i)^intracranial tumour(s?)$", + "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", + "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", + "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", + "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", + "^Úroveň motivačního procesu jako způsobu vedení lidí$", + "^The level of motivation process as a leadership$", + "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) 
Hospitals$", + "(?i)^news and events$", + "(?i)^NOVOSTI I DOGAĐAJI$", + "^Sansū no gakushū$", + "^Posouzení informačního systému firmy a návrh změn$", + "^Information System Assessment and Proposal for ICT Modification$", + "^Stresové zatížení pracovníků ve vybrané profesi$", + "^Stress load in a specific job$", + + "^Sunday: Poster Sessions, Pt.*$", + "^Monday: Poster Sessions, Pt.*$", + "^Wednesday: Poster Sessions, Pt.*", + "^Tuesday: Poster Sessions, Pt.*$", + + "^Analýza reklamy$", + "^Analysis of advertising$", + + "^Shōgaku shūshinsho$", + "^Shōgaku sansū$", + "^Shintei joshi kokubun$", + "^Taishō joshi kokubun dokuhon$", + "^Joshi kokubun$", + + "^Účetní uzávěrka a účetní závěrka v ČR$", + "(?i)^The \"?Causes\"? of Cancer$", + "^Normas para la publicación de artículos$", + "^Editor('|s)(s|') [Rr]eply$", + "^Editor(’|s)(s|’) letter$", + "^Redaktoriaus žodis$", + "^DISCUSSION ON THE PRECEDING PAPER$", + "^Kōtō shōgaku shūshinsho jidōyō$", + "^Shōgaku nihon rekishi$", + "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", + "^Préface$", + "^Occupational [Hh]ealth [Ss]ervices.$", + "^In Memoriam Professor Toshiyuki TAKESHIMA$", + "^Účetní závěrka ve vybraném podniku.*$", + "^Financial statements in selected company$", + "^Abdominal [Aa]ortic [Aa]neurysms.*$", + "^Pseudomyxoma peritonei$", + "^Kazalo autora$", + + "(?i)^uvodna riječ$", + "^Motivace jako způsob vedení lidí$", + "^Motivation as a leadership$", + "^Polyfunkční dům$", + "^Multi\\-funkcional building$", + "^Podnikatelský plán$", + "(?i)^Podnikatelský záměr$", + "(?i)^Business Plan$", + "^Oceňování nemovitostí$", + "^Marketingová komunikace$", + "^Marketing communication$", + "^Sumario Analítico$", + "^Riječ uredništva$", + "^Savjetovanja i priredbe$", + "^Índice$", + "^(Starobosanski nadpisi).*$", + "^Vzdělávání pracovníků v organizaci$", + "^Staff training in organization$", + "^(Life Histories of North American Geometridae).*$", + "^Strategická analýza podniku$", + 
"^Strategic Analysis of an Enterprise$", + "^Sadržaj$", + "^Upute suradnicima$", + "^Rodinný dům$", + "(?i)^Fami(l)?ly house$", + "^Upute autorima$", + "^Strategic Analysis$", + "^Finanční analýza vybraného podniku$", + "^Finanční analýza$", + "^Riječ urednika$", + "(?i)^Content(s?)$", + "(?i)^Inhalt$", + "^Jinjō shōgaku shūshinsho jidōyō$", + "(?i)^Index$", + "^Chūgaku kokubun kyōkasho$", + "^Retrato de una mujer$", + "^Retrato de un hombre$", + "^Kōtō shōgaku dokuhon$", + "^Shotōka kokugo$", + "^Shōgaku dokuhon$", + "^Jinjō shōgaku kokugo dokuhon$", + "^Shinsei kokugo dokuhon$", + "^Teikoku dokuhon$", + "^Instructions to Authors$", + "^KİTAP TAHLİLİ$", + "^PRZEGLĄD PIŚMIENNICTWA$", + "(?i)^Presentación$", + "^İçindekiler$", + "(?i)^Tabl?e of contents$", + "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", + "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", + "^Editorial( Board)?$", + "(?i)^Editorial \\(English\\)$", + "^Editörden$", + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*", + "(?i)^.*authors['’′]? reply\\.?$", + "(?i)^.*authors['’′]? response\\.?$" + ] + } + } +} \ No newline at end of file