From 72ca3bb9baa054d0e8ffc822d5d20ba0f14972ee Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 18 Dec 2019 16:19:36 +0100 Subject: [PATCH] implementation of new aggregation in the tree node processing --- .../java/eu/dnetlib/pace/DedupLocalTest.java | 21 +-- .../java/eu/dnetlib/pace/DedupTestUtils.java | 6 +- .../config/organization.current.conf.json | 55 +++++--- .../pace/config/organization.strict.conf.json | 20 ++- .../java/eu/dnetlib/pace/tree/CityMatch.java | 2 +- .../java/eu/dnetlib/pace/tree/ExactMatch.java | 3 + .../pace/tree/JaroWinklerNormalizedName.java | 2 + .../eu/dnetlib/pace/tree/JsonListMatch.java | 2 +- .../eu/dnetlib/pace/tree/KeywordMatch.java | 2 +- .../eu/dnetlib/pace/tree/StringListMatch.java | 2 +- .../eu/dnetlib/pace/tree/support/AggType.java | 6 +- .../dnetlib/pace/tree/support/FieldStats.java | 12 +- .../pace/tree/support/TreeNodeDef.java | 2 +- .../pace/tree/support/TreeNodeStats.java | 34 ++++- .../pace/tree/support/TreeProcessor.java | 2 +- .../eu/dnetlib/pace/util/MapDocumentUtil.java | 4 - .../eu/dnetlib/pace/config/stopwords_en.txt | 1 - .../eu/dnetlib/pace/config/stopwords_es.txt | 12 -- .../eu/dnetlib/pace/config/stopwords_fr.txt | 1 - .../pace/comparators/ComparatorTest.java | 9 +- .../eu/dnetlib/pace/config/ConfigTest.java | 25 +--- .../config/organization.current.conf.json | 104 ++++++++++---- .../eu/dnetlib/pace/config/organization.json | 1 + .../config/organization.no_synonyms.conf.json | 128 ++++++++++++++++-- 24 files changed, 314 insertions(+), 142 deletions(-) diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java index 8583ac5..6f9bb13 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java @@ -31,7 +31,7 @@ public class DedupLocalTest extends DedupTestUtils { @Before public void setup() { - config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/publication.current.conf.json", DedupLocalTest.class)); + config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", DedupLocalTest.class)); treeProcessor = new TreeProcessor(config); final SparkSession spark = SparkSession @@ -41,7 +41,7 @@ public class DedupLocalTest extends DedupTestUtils { .getOrCreate(); context = new JavaSparkContext(spark.sparkContext()); - final URL dataset = getClass().getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json"); + final URL dataset = getClass().getResource("/eu/dnetlib/pace/examples/organization.to.fix.json"); entities = context.textFile(dataset.getPath()); } @@ -116,12 +116,12 @@ public class DedupLocalTest extends DedupTestUtils { } -@Ignore + @Ignore @Test public void matchTest(){ - String JSONEntity1 = "{\"dateoftransformation\":\"2018-06-04\",\"originalId\":[\"opendoar____::Universiti_Sains_Malaysia\"],\"collectedfrom\":[{\"value\":\"OpenDOAR\",\"key\":\"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"websiteurl\":{\"value\":\"http://www.usm.my/my/\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Universiti Sains Malaysia\"},\"country\":{\"classid\":\"MY\",\"classname\":\"Malaysia\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2015-08-24\",\"type\":20,\"id\":\"20|opendoar____::04315c25b0eb56eacb967901557f86b1\"}"; - String JSONEntity2 = "{\"dateoftransformation\":\"2019-10-07\",\"originalId\":[\"corda_______::997941627\"],\"collectedfrom\":[{\"value\":\"CORDA - COmmon Research DAta Warehouse\",\"key\":\"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"true\"},\"eclegalperson\":{\"value\":\"true\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"USM\"},\"ecresearchorganization\":{\"value\":\"true\"},\"ecnonprofit\":{\"value\":\"true\"},\"ecenterprise\":{\"value\":\"false\"},\"websiteurl\":{\"value\":\"http://www.usm.my/my\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"UNIVERSITI SAINS MALAYSIA*\"},\"country\":{\"classid\":\"MY\",\"classname\":\"Malaysia\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"true\"}}},\"dateofcollection\":\"2015-09-10\",\"type\":20,\"id\":\"20|corda_______::1fb0c86ddf389377454d5520d2796dad\"}"; + String JSONEntity1 = "{\"dateoftransformation\":\"2019-10-14 08:59:35.295767\",\"originalId\":[\"openorgs____::0000010656\"],\"pid\":[{\"qualifier\":{\"classid\":\"ISNI\",\"classname\":\"ISNI\",\"schemename\":\"dnet:pid_types\",\"schemeid\":\"dnet:pid_types\"},\"value\":\"0000 0004 0370 7052\"},{\"qualifier\":{\"classid\":\"Wikidata\",\"classname\":\"Wikidata\",\"schemename\":\"dnet:pid_types\",\"schemeid\":\"dnet:pid_types\"},\"value\":\"Q17012267\"},{\"qualifier\":{\"classid\":\"grid.ac\",\"classname\":\"grid.ac\",\"schemename\":\"dnet:pid_types\",\"schemeid\":\"dnet:pid_types\"},\"value\":\"grid.418822.5\"}],\"collectedfrom\":[{\"value\":\"OpenOrgs Database\",\"key\":\"10|openaire____::0362fcdb3076765d9c0041ad331553e8\"}],\"organization\":{\"metadata\":{\"legalshortname\":{\"value\":\"ENVIRON (United States)\"},\"websiteurl\":{\"value\":\"http://www.ramboll-environ.com/\"},\"country\":{\"classid\":\"US\",\"classname\":\"United States\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"alternativeNames\":[{\"value\":\"ENVIRON (United States)\"},{\"value\":\"Ramboll Environ\"}],\"legalname\":{\"value\":\"ENVIRON (United States)\"}}},\"dateofcollection\":\"\",\"type\":20,\"id\":\"20|openorgs____::d3c5966e2089c408f43aa899fd0df656\"}"; + String JSONEntity2 = "{\"dateoftransformation\":\"2018-06-04\",\"originalId\":[\"nsf_________::United_States_Military_Academy\"],\"collectedfrom\":[{\"value\":\"NSF - National Science Foundation\",\"key\":\"10|openaire____::dd69b4a1513c9de9f46faf24048da1e8\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"United States Military Academy\"},\"country\":{\"classid\":\"US\",\"classname\":\"United States\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2016-03-10\",\"type\":20,\"id\":\"20|nsf_________::177e8a2cf0c987cf8ac33933ddf3e260\"}"; MapDocument mapDoc1 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity1); MapDocument mapDoc2 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity2); @@ -131,15 +131,4 @@ public class DedupLocalTest extends DedupTestUtils { System.out.println(treeStats); } - -@Ignore - @Test - public void parseJSONEntityTest(){ - String jsonEntity = "{\"dateoftransformation\":\"2018-09-19\",\"originalId\":[\"doajarticles::Sociedade_Brasileira_de_Reumatologia\"],\"collectedfrom\":[{\"value\":\"DOAJ-Articles\",\"key\":\"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"country\":{\"classid\":\"BR\",\"classname\":\"Brazil\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2018-09-19\",\"type\":20,\"id\":\"20|doajarticles::0019ba7a22c5bc733c3206bde28ff568\"}"; - - MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, jsonEntity); - - System.out.println("mapDocument = " + mapDocument); - } - } \ No newline at end of file diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestUtils.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestUtils.java index 9cce092..2a9c940 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestUtils.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestUtils.java @@ -18,9 +18,9 @@ public abstract class DedupTestUtils { System.out.println(cc); }); //print nondeduped - nonDeduplicated.foreach(cc -> { - System.out.println(cc); - }); +// nonDeduplicated.foreach(cc -> { +// System.out.println(cc.getFieldMap().get("legalname").stringValue()); +// }); System.out.println("Non duplicates: " + nonDeduplicated.count()); System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count()); diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json index 8fc454a..7fa731c 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json @@ -7,6 +7,7 @@ "queueMaxSize" : "2000", "groupMaxSize" : "50", "slidingWindowSize" : "200", + "idPath":"$.id", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true", "maxIterations": "20" @@ -30,7 +31,7 @@ } ], "threshold": 1, - "aggregation": "SC", + "aggregation": "AVG", "positive": "MATCH", "negative": "NO_MATCH", "undefined": "layer2", @@ -51,10 +52,24 @@ "weight": 1, "countIfUndefined": "true", "params": {} + }, + { + "field": "legalname", + "comparator": "numbersMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} + }, + { + "field": "legalname", + "comparator": "romansMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} } ], "threshold": 1, - "aggregation": "NC", + "aggregation": "AND", "positive": "layer3", "negative": "NO_MATCH", "undefined": "layer3", @@ -68,12 +83,11 @@ "weight": 1.0, "countIfUndefined": "true", "params": { - "windowSize": "4", - "threshold": "0.0" + "windowSize": "4" } } ], - "threshold": 1.0, + "threshold": 0.7, "aggregation": "W_MEAN", "positive": "layer4", "negative": "NO_MATCH", @@ -86,19 +100,18 @@ "field": "legalname", "comparator": "keywordMatch", "weight": 1.0, - "countIfUndefined": "false", + "countIfUndefined": "true", "params": { - "windowSize": "4", - "threshold": "0.7" + "windowSize": "4" } } ], - "threshold": 1.0, - "aggregation": "W_MEAN", + "threshold": 0.9, + "aggregation": "AVG", "positive": "layer5", "negative": "NO_MATCH", "undefined": "layer5", - "ignoreUndefined": "false" + "ignoreUndefined": "true" }, "layer5": { "fields": [ @@ -119,7 +132,7 @@ "params": {} } ], - "threshold": 0.9, + "threshold": 0.99, "aggregation": "W_MEAN", "positive": "MATCH", "negative": "NO_MATCH", @@ -128,11 +141,12 @@ } }, "model" : [ - { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, - { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, - { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, - { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, - { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"} + { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, + { "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"}, + { "name" : "originalId", "type" : "String", "path" : "$.id" } ], "blacklists" : { "legalname" : [] @@ -232,7 +246,7 @@ "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], - "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], + "key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""], "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""], "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], @@ -242,7 +256,10 @@ "key::102": ["informatics","informatica","informática","informática","informatica",""], "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], - "key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"] + "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"], + "key::106" : ["seminary", "seminario", "seminaire", "seminar"], + "key::107" : ["agricultural forestry", "af", "a f"], + "key::108" : ["agricultural mechanical", "am", "a m"] } } } \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json index d545f5b..8e50949 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/organization.strict.conf.json @@ -74,7 +74,7 @@ } ], "threshold": 1, - "aggregation": "SC", + "aggregation": "OR", "positive": "MATCH", "negative": "NO_MATCH", "undefined": "layer2", @@ -112,7 +112,7 @@ } ], "threshold": 1, - "aggregation": "NC", + "aggregation": "AND", "positive": "layer3", "negative": "NO_MATCH", "undefined": "layer3", @@ -126,12 +126,11 @@ "weight": 1.0, "countIfUndefined": "true", "params": { - "windowSize": "4", - "threshold": "0.7" + "windowSize": "4" } } ], - "threshold": 1.0, + "threshold": 0.7, "aggregation": "W_MEAN", "positive": "layer4", "negative": "NO_MATCH", @@ -146,13 +145,12 @@ "weight": 1.0, "countIfUndefined": "true", "params": { - "windowSize": "4", - "threshold": "0.9" + "windowSize": "4" } } ], - "threshold": 1.0, - "aggregation": "W_MEAN", + "threshold": 0.9, + "aggregation": "AVG", "positive": "layer5", "negative": "NO_MATCH", "undefined": "layer5", @@ -303,8 +301,8 @@ "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"], "key::106" : ["seminary", "seminario", "seminaire", "seminar"], - "key::107" : ["agricultural forestry", "af", "a f", "a&f"], - "key::108" : ["agricultural mechanical", "am", "a m", "a&m"] + "key::107" : ["agricultural forestry", "af", "a f"], + "key::108" : ["agricultural mechanical", "am", "a m"] } } } \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java index 2362f47..8ff818e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java @@ -41,7 +41,7 @@ public class CityMatch extends AbstractComparator { else { if (codes1.isEmpty() ^ codes2.isEmpty()) return -1; //undefined if one of the two has no cities - return commonElementsPercentage(codes1, codes2) > Double.parseDouble(params.getOrDefault("threshold", "0.0")) ? 1.0 : 0.0; + return commonElementsPercentage(codes1, codes2); } } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java index f8b7b74..21479cf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -24,6 +24,9 @@ public class ExactMatch extends AbstractComparator { @Override public double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1.0; //return -1 if a field is missing + } return a.equals(b) ? 1.0 : 0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index b89cffa..76af574 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -39,6 +39,8 @@ public class JaroWinklerNormalizedName extends AbstractComparator { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); + //TODO change this implementation, it needs only to erase cities and keywords + Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java index 166151d..4fea8d8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -46,7 +46,7 @@ public class JsonListMatch extends AbstractComparator { return 0.0; } - return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0; + return (double)incommon / (incommon + simDiff); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java index 476c390..40a90a0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java @@ -41,7 +41,7 @@ public class KeywordMatch extends AbstractComparator { else { if (codes1.isEmpty() ^ codes2.isEmpty()) return -1; //undefined if one of the two has no keywords - return commonElementsPercentage(codes1, codes2) > Double.parseDouble(params.getOrDefault("threshold", "0.0")) ? 1.0 : 0.0; + return commonElementsPercentage(codes1, codes2); } } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java index f9b53d3..3ed98a0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java @@ -41,7 +41,7 @@ public class StringListMatch extends AbstractComparator { return 0.0; } - return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0; + return (double)incommon / (incommon + simDiff); } } \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java index 697294a..6ea8172 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java @@ -9,10 +9,8 @@ public enum AggType { SUM, MAX, MIN, - NC, //necessary condition - SC, //sufficient condition - AND, - OR; + AND, //used for necessary conditions + OR; //used for sufficient conditions public static AggType getEnum(String value) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java index b1341fc..072a227 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java @@ -11,20 +11,30 @@ import java.io.Serializable; public class FieldStats implements Serializable { private double weight; //weight for the field (to be used in the aggregation) + private double threshold; //threshold for the field (to be used in case it is a sufficient or a necessary condition) private double result; //the result of the comparison private Field a; private Field b; private boolean countIfUndefined; - public FieldStats(double weight, double result, boolean countIfUndefined, Field a, Field b) { + public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Field a, Field b) { this.weight = weight; + this.threshold = threshold; this.result = result; this.countIfUndefined = countIfUndefined; this.a = a; this.b = b; } + public double getThreshold() { + return threshold; + } + + public void setThreshold(double threshold) { + this.threshold = threshold; + } + public double getWeight() { return weight; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 57552e6..037ed72 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -47,7 +47,7 @@ public class TreeNodeDef implements Serializable { double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); - stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()))); + stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()))); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java index c5b1d0f..f9612a4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java @@ -84,6 +84,32 @@ public class TreeNodeStats implements Serializable { return min; } + //if at least one is true, return 1.0 + public double or(){ + for (FieldStats fieldStats : this.results.values()) { + if (fieldStats.getResult() >= fieldStats.getThreshold()) + return 1.0; + } + return 0.0; + } + + //if at least one is false, return 0.0 + public double and() { + for (FieldStats fieldStats : this.results.values()) { + + if (fieldStats.getResult() == -1) { + if (fieldStats.isCountIfUndefined()) + return 0.0; + } + else { + if (fieldStats.getResult() < fieldStats.getThreshold()) + return 0.0; + } + + } + return 1.0; + } + public double getFinalScore(AggType aggregation){ switch (aggregation){ @@ -91,16 +117,16 @@ public class TreeNodeStats implements Serializable { return scoreSum()/fieldsCount(); case SUM: return scoreSum(); - case SC: - case OR: case MAX: return max(); - case NC: - case AND: case MIN: return min(); case W_MEAN: return weightedScoreSum()/weightSum(); + case OR: + return or(); + case AND: + return and(); default: return 0.0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java index b9af359..731f659 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java @@ -38,7 +38,7 @@ public class TreeProcessor{ TreeNodeDef currentNode = config.decisionTree().get(current); //throw an exception if the node doesn't exist if (currentNode == null) - throw new PaceException("The Tree Node doesn't exist: " + current); + throw new PaceException("Missing tree node: " + current); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); treeStats.addNodeStats(current, stats); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java index 54da950..a4d8c96 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -18,13 +18,9 @@ import java.util.function.Predicate; public class MapDocumentUtil { - - private static final ObjectMapper mapper = new ObjectMapper(); public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); - - public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) { MapDocument m = new MapDocument(); m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json)); diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_en.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_en.txt index 9a76d82..0a013d3 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_en.txt +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_en.txt @@ -100,7 +100,6 @@ that the their theirs -them themselves then there diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_es.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_es.txt index 0cf607d..79d31a0 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_es.txt +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_es.txt @@ -1,14 +1,3 @@ -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -_ a actualmente acuerdo @@ -637,7 +626,6 @@ todavia todavía todo todos -total trabaja trabajais trabajamos diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_fr.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_fr.txt index 0e2789f..f767a12 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_fr.txt +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_fr.txt @@ -211,7 +211,6 @@ encore enfin entre envers -environ es essai est diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 9bd3a44..7365dba 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -62,17 +62,16 @@ public class ComparatorTest extends AbstractPaceFunctions { final KeywordMatch keywordMatch = new KeywordMatch(params); - assertEquals(0.0, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); + assertEquals(0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf)); assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf)); assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf)); - assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); - assertEquals(0.0, keywordMatch.distance("University College London", "University of London", conf)); - assertEquals(0.0, keywordMatch.distance("Washington State University", "University of Washington", conf)); + assertEquals(0.5, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); + assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf)); + assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf)); assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf)); - } @Test diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 5d66434..3b2917c 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -65,36 +65,23 @@ public class ConfigTest extends AbstractPaceTest { } @Test - public void asMapDocumentTest() throws Exception { + public void asMapDocumentTest() { - DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.current.conf.json")); - final String json = readFromClasspath("pub2.json"); + final String json = readFromClasspath("organization.json"); final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); System.out.println("mapDocument = " + mapDocument.getFieldMap()); - - - System.out.println(mapDocument.getFieldMap().values().stream().map(Field::isEmpty).count()); - } - - - - @Test public void testJPath() { - final String json = readFromClasspath("pub2.json"); - - final String jpath ="$.pid"; - - - final List jPathList = MapDocumentUtil.getJPathList(jpath, json, Type.JSON); - - System.out.println("jPathList = " + jPathList); + final String json = readFromClasspath("organization.json"); + final String jpath ="$.id"; + System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json)); } } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json index a16ade3..d70609e 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json @@ -1,6 +1,6 @@ { "wf" : { - "threshold" : "0.9", + "threshold" : "0.99", "dedupRun" : "001", "entityType" : "organization", "orderField" : "legalname", @@ -8,7 +8,9 @@ "groupMaxSize" : "50", "slidingWindowSize" : "200", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], - "includeChildren" : "true" + "includeChildren" : "true", + "maxIterations": "20", + "idPath": "$.id" }, "pace" : { "clustering" : [ @@ -23,59 +25,110 @@ { "field": "gridid", "comparator": "exactMatch", - "weight": 1.0, - "countIfUndefined": "true", + "weight": 1, + "countIfUndefined": "false", "params": {} } ], - "threshold": 1.0, - "aggregation": "MAX", + "threshold": 1, + "aggregation": "SC", "positive": "MATCH", - "negative": "layer2", + "negative": "NO_MATCH", "undefined": "layer2", - "ignoreUndefined": "true" + "ignoreUndefined": "false" }, "layer2": { "fields": [ { "field": "websiteurl", "comparator": "domainExactMatch", - "weight": 1.0, - "countIfUndefined": "true", + "weight": 1, + "countIfUndefined": "false", "params": {} }, { "field": "country", "comparator": "exactMatch", - "weight": 1.0, - "countIfUndefined": "false", + "weight": 1, + "countIfUndefined": "true", + "params": {} + }, + { + "field": "legalname", + "comparator": "numbersMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} + }, + { + "field": "legalname", + "comparator": "romansMatch", + "weight": 1, + "countIfUndefined": "true", "params": {} } ], - "threshold": 1.0, - "aggregation": "MIN", + "threshold": 1, + "aggregation": "NC", "positive": "layer3", "negative": "NO_MATCH", "undefined": "layer3", - "ignoreUndefined": "false" + "ignoreUndefined": "true" }, "layer3": { "fields": [ { "field": "legalname", - "comparator": "jaroWinklerNormalizedName", - "weight": 0.9, + "comparator": "cityMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + "windowSize": "4" + } + } + ], + "threshold": 0.1, + "aggregation": "W_MEAN", + "positive": "layer4", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + }, + "layer4": { + "fields": [ + { + "field": "legalname", + "comparator": "keywordMatch", + "weight": 1.0, "countIfUndefined": "false", "params": { - "windowSize": 4, - "threshold": 0.7 + "windowSize": "4" + } + } + ], + "threshold": 0.7, + "aggregation": "W_MEAN", + "positive": "layer5", + "negative": "NO_MATCH", + "undefined": "layer5", + "ignoreUndefined": "false" + }, + "layer5": { + "fields": [ + { + "field": "legalname", + "comparator": "jaroWinklerNormalizedName", + "weight": 0.9, + "countIfUndefined": "true", + "params": { + "windowSize": "4" } }, { "field": "legalshortname", "comparator": "jaroWinklerNormalizedName", "weight": 0.1, - "countIfUndefined": "true", + "countIfUndefined": "false", "params": {} } ], @@ -90,9 +143,9 @@ "model" : [ { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, - { "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" }, + { "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, - { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"} + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid=='grid.ac')].value"} ], "blacklists" : { "legalname" : [] @@ -192,7 +245,7 @@ "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], - "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], + "key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""], "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""], "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], @@ -202,7 +255,10 @@ "key::102": ["informatics","informatica","informática","informática","informatica",""], "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], - "key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"] + "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"], + "key::106" : ["seminary", "seminario", "seminaire", "seminar"], + "key::107" : ["agricultural forestry", "af", "a f", "a&f"], + "key::108" : ["agricultural mechanical", "am", "a m", "a&m"] } } } \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.json index e69de29..c2e44e1 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.json @@ -0,0 +1 @@ +{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000000985"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0004 0478 6426"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100000126"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100000190"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100000205"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100005822"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100005823"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100005824"},{"qualifier":{"classid":"OrgRef","classname":"OrgRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"70243"},{"qualifier":{"classid":"Wikidata","classname":"Wikidata","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"Q503577"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.239119.1"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"USDoC"},"websiteurl":{"value":"http://www.commerce.gov/"},"country":{"classid":"US","classname":"United States","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Departamento de Comercio de Estados Unidos"},{"value":"Département du commerce des États-unis"},{"value":"United States Department of Commerce"},{"value":"United States Department of Commerce and Labor"}],"legalname":{"value":"United States Department of Commerce"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::051dc42607887282d1939f094e5906f5"} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf.json index 80a53c8..5e4eafc 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf.json @@ -1,6 +1,6 @@ { "wf" : { - "threshold" : "0.9", + "threshold" : "0.99", "dedupRun" : "001", "entityType" : "organization", "orderField" : "legalname", @@ -8,7 +8,8 @@ "groupMaxSize" : "50", "slidingWindowSize" : "200", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], - "includeChildren" : "true" + "includeChildren" : "true", + "maxIterations": "20" }, "pace" : { "clustering" : [ @@ -18,21 +19,124 @@ { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } ], "decisionTree" : { - "start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"}, - "layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"}, - "layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"} + "start": { + "fields": [ + { + "field": "gridid", + "comparator": "exactMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1, + "aggregation": "SC", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "layer2", + "ignoreUndefined": "false" + }, + "layer2": { + "fields": [ + { + "field": "websiteurl", + "comparator": "domainExactMatch", + "weight": 1, + "countIfUndefined": "false", + "params": {} + }, + { + "field": "country", + "comparator": "exactMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 1, + "aggregation": "NC", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "true" + }, + "layer3": { + "fields": [ + { + "field": "legalname", + "comparator": "cityMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + "windowSize": "4", + "threshold": "0.0" + } + } + ], + "threshold": 1.0, + "aggregation": "W_MEAN", + "positive": "layer4", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + }, + "layer4": { + "fields": [ + { + "field": "legalname", + "comparator": "keywordMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "windowSize": "4", + "threshold": "0.7" + } + } + ], + "threshold": 1.0, + "aggregation": "W_MEAN", + "positive": "layer5", + "negative": "NO_MATCH", + "undefined": "layer5", + "ignoreUndefined": "false" + }, + "layer5": { + "fields": [ + { + "field": "legalname", + "comparator": "jaroWinklerNormalizedName", + "weight": 0.9, + "countIfUndefined": "true", + "params": { + "windowSize": "4" + } + }, + { + "field": "legalshortname", + "comparator": "jaroWinklerNormalizedName", + "weight": 0.1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 0.9, + "aggregation": "W_MEAN", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } }, "model" : [ - { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, - { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, - { "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" }, - { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, - { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"} + { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, + { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, + { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"} ], "blacklists" : { "legalname" : [] }, - "synonyms": { - } + "synonyms": {} } } \ No newline at end of file