implementation of new aggregation in the tree node processing

This commit is contained in:
miconis 2019-12-18 16:19:36 +01:00
parent 4af490221b
commit 72ca3bb9ba
24 changed files with 314 additions and 142 deletions

View File

@ -31,7 +31,7 @@ public class DedupLocalTest extends DedupTestUtils {
@Before @Before
public void setup() { public void setup() {
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/publication.current.conf.json", DedupLocalTest.class)); config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", DedupLocalTest.class));
treeProcessor = new TreeProcessor(config); treeProcessor = new TreeProcessor(config);
final SparkSession spark = SparkSession final SparkSession spark = SparkSession
@ -41,7 +41,7 @@ public class DedupLocalTest extends DedupTestUtils {
.getOrCreate(); .getOrCreate();
context = new JavaSparkContext(spark.sparkContext()); context = new JavaSparkContext(spark.sparkContext());
final URL dataset = getClass().getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json"); final URL dataset = getClass().getResource("/eu/dnetlib/pace/examples/organization.to.fix.json");
entities = context.textFile(dataset.getPath()); entities = context.textFile(dataset.getPath());
} }
@ -116,12 +116,12 @@ public class DedupLocalTest extends DedupTestUtils {
} }
@Ignore @Ignore
@Test @Test
public void matchTest(){ public void matchTest(){
String JSONEntity1 = "{\"dateoftransformation\":\"2018-06-04\",\"originalId\":[\"opendoar____::Universiti_Sains_Malaysia\"],\"collectedfrom\":[{\"value\":\"OpenDOAR\",\"key\":\"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"websiteurl\":{\"value\":\"http://www.usm.my/my/\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Universiti Sains Malaysia\"},\"country\":{\"classid\":\"MY\",\"classname\":\"Malaysia\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2015-08-24\",\"type\":20,\"id\":\"20|opendoar____::04315c25b0eb56eacb967901557f86b1\"}"; String JSONEntity1 = "{\"dateoftransformation\":\"2019-10-14 08:59:35.295767\",\"originalId\":[\"openorgs____::0000010656\"],\"pid\":[{\"qualifier\":{\"classid\":\"ISNI\",\"classname\":\"ISNI\",\"schemename\":\"dnet:pid_types\",\"schemeid\":\"dnet:pid_types\"},\"value\":\"0000 0004 0370 7052\"},{\"qualifier\":{\"classid\":\"Wikidata\",\"classname\":\"Wikidata\",\"schemename\":\"dnet:pid_types\",\"schemeid\":\"dnet:pid_types\"},\"value\":\"Q17012267\"},{\"qualifier\":{\"classid\":\"grid.ac\",\"classname\":\"grid.ac\",\"schemename\":\"dnet:pid_types\",\"schemeid\":\"dnet:pid_types\"},\"value\":\"grid.418822.5\"}],\"collectedfrom\":[{\"value\":\"OpenOrgs Database\",\"key\":\"10|openaire____::0362fcdb3076765d9c0041ad331553e8\"}],\"organization\":{\"metadata\":{\"legalshortname\":{\"value\":\"ENVIRON (United 
States)\"},\"websiteurl\":{\"value\":\"http://www.ramboll-environ.com/\"},\"country\":{\"classid\":\"US\",\"classname\":\"United States\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"alternativeNames\":[{\"value\":\"ENVIRON (United States)\"},{\"value\":\"Ramboll Environ\"}],\"legalname\":{\"value\":\"ENVIRON (United States)\"}}},\"dateofcollection\":\"\",\"type\":20,\"id\":\"20|openorgs____::d3c5966e2089c408f43aa899fd0df656\"}";
String JSONEntity2 = "{\"dateoftransformation\":\"2019-10-07\",\"originalId\":[\"corda_______::997941627\"],\"collectedfrom\":[{\"value\":\"CORDA - COmmon Research DAta Warehouse\",\"key\":\"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"true\"},\"eclegalperson\":{\"value\":\"true\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"USM\"},\"ecresearchorganization\":{\"value\":\"true\"},\"ecnonprofit\":{\"value\":\"true\"},\"ecenterprise\":{\"value\":\"false\"},\"websiteurl\":{\"value\":\"http://www.usm.my/my\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"UNIVERSITI SAINS MALAYSIA*\"},\"country\":{\"classid\":\"MY\",\"classname\":\"Malaysia\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"true\"}}},\"dateofcollection\":\"2015-09-10\",\"type\":20,\"id\":\"20|corda_______::1fb0c86ddf389377454d5520d2796dad\"}"; String JSONEntity2 = "{\"dateoftransformation\":\"2018-06-04\",\"originalId\":[\"nsf_________::United_States_Military_Academy\"],\"collectedfrom\":[{\"value\":\"NSF - National Science Foundation\",\"key\":\"10|openaire____::dd69b4a1513c9de9f46faf24048da1e8\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"United States Military Academy\"},\"country\":{\"classid\":\"US\",\"classname\":\"United 
States\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2016-03-10\",\"type\":20,\"id\":\"20|nsf_________::177e8a2cf0c987cf8ac33933ddf3e260\"}";
MapDocument mapDoc1 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity1); MapDocument mapDoc1 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity1);
MapDocument mapDoc2 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity2); MapDocument mapDoc2 = MapDocumentUtil.asMapDocumentWithJPath(config, JSONEntity2);
@ -131,15 +131,4 @@ public class DedupLocalTest extends DedupTestUtils {
System.out.println(treeStats); System.out.println(treeStats);
} }
/**
 * Manual smoke check (ignored by default): converts a sample organization JSON record
 * into a MapDocument using the loaded dedup configuration and prints the result.
 */
@Ignore
@Test
public void parseJSONEntityTest(){
    final String organizationJson = "{\"dateoftransformation\":\"2018-09-19\",\"originalId\":[\"doajarticles::Sociedade_Brasileira_de_Reumatologia\"],\"collectedfrom\":[{\"value\":\"DOAJ-Articles\",\"key\":\"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824\"}],\"organization\":{\"metadata\":{\"eclegalbody\":{\"value\":\"false\"},\"eclegalperson\":{\"value\":\"false\"},\"ecinternationalorganization\":{\"value\":\"false\"},\"legalshortname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"ecresearchorganization\":{\"value\":\"false\"},\"ecnonprofit\":{\"value\":\"false\"},\"ecenterprise\":{\"value\":\"false\"},\"ecnutscode\":{\"value\":\"false\"},\"ecinternationalorganizationeurinterests\":{\"value\":\"false\"},\"legalname\":{\"value\":\"Sociedade Brasileira de Reumatologia\"},\"country\":{\"classid\":\"BR\",\"classname\":\"Brazil\",\"schemename\":\"dnet:countries\",\"schemeid\":\"dnet:countries\"},\"echighereducation\":{\"value\":\"false\"},\"ecsmevalidated\":{\"value\":\"false\"}}},\"dateofcollection\":\"2018-09-19\",\"type\":20,\"id\":\"20|doajarticles::0019ba7a22c5bc733c3206bde28ff568\"}";

    // Parse through the JSONPath-based mapper and dump the outcome for visual inspection.
    final MapDocument parsed = MapDocumentUtil.asMapDocumentWithJPath(config, organizationJson);
    System.out.println("mapDocument = " + parsed);
}
} }

View File

@ -18,9 +18,9 @@ public abstract class DedupTestUtils {
System.out.println(cc); System.out.println(cc);
}); });
//print nondeduped //print nondeduped
nonDeduplicated.foreach(cc -> { // nonDeduplicated.foreach(cc -> {
System.out.println(cc); // System.out.println(cc.getFieldMap().get("legalname").stringValue());
}); // });
System.out.println("Non duplicates: " + nonDeduplicated.count()); System.out.println("Non duplicates: " + nonDeduplicated.count());
System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count()); System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());

View File

@ -7,6 +7,7 @@
"queueMaxSize" : "2000", "queueMaxSize" : "2000",
"groupMaxSize" : "50", "groupMaxSize" : "50",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
"idPath":"$.id",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true", "includeChildren" : "true",
"maxIterations": "20" "maxIterations": "20"
@ -30,7 +31,7 @@
} }
], ],
"threshold": 1, "threshold": 1,
"aggregation": "SC", "aggregation": "AVG",
"positive": "MATCH", "positive": "MATCH",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer2", "undefined": "layer2",
@ -51,10 +52,24 @@
"weight": 1, "weight": 1,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": {} "params": {}
},
{
"field": "legalname",
"comparator": "numbersMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
},
{
"field": "legalname",
"comparator": "romansMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
} }
], ],
"threshold": 1, "threshold": 1,
"aggregation": "NC", "aggregation": "AND",
"positive": "layer3", "positive": "layer3",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer3", "undefined": "layer3",
@ -68,12 +83,11 @@
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": { "params": {
"windowSize": "4", "windowSize": "4"
"threshold": "0.0"
} }
} }
], ],
"threshold": 1.0, "threshold": 0.7,
"aggregation": "W_MEAN", "aggregation": "W_MEAN",
"positive": "layer4", "positive": "layer4",
"negative": "NO_MATCH", "negative": "NO_MATCH",
@ -86,19 +100,18 @@
"field": "legalname", "field": "legalname",
"comparator": "keywordMatch", "comparator": "keywordMatch",
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "false", "countIfUndefined": "true",
"params": { "params": {
"windowSize": "4", "windowSize": "4"
"threshold": "0.7"
} }
} }
], ],
"threshold": 1.0, "threshold": 0.9,
"aggregation": "W_MEAN", "aggregation": "AVG",
"positive": "layer5", "positive": "layer5",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer5", "undefined": "layer5",
"ignoreUndefined": "false" "ignoreUndefined": "true"
}, },
"layer5": { "layer5": {
"fields": [ "fields": [
@ -119,7 +132,7 @@
"params": {} "params": {}
} }
], ],
"threshold": 0.9, "threshold": 0.99,
"aggregation": "W_MEAN", "aggregation": "W_MEAN",
"positive": "MATCH", "positive": "MATCH",
"negative": "NO_MATCH", "negative": "NO_MATCH",
@ -128,11 +141,12 @@
} }
}, },
"model" : [ "model" : [
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, { "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"} { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
], ],
"blacklists" : { "blacklists" : {
"legalname" : [] "legalname" : []
@ -232,7 +246,7 @@
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], "key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""], "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""], "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
@ -242,7 +256,10 @@
"key::102": ["informatics","informatica","informática","informática","informatica",""], "key::102": ["informatics","informatica","informática","informática","informatica",""],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
"key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"] "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
"key::107" : ["agricultural forestry", "af", "a f"],
"key::108" : ["agricultural mechanical", "am", "a m"]
} }
} }
} }

View File

@ -74,7 +74,7 @@
} }
], ],
"threshold": 1, "threshold": 1,
"aggregation": "SC", "aggregation": "OR",
"positive": "MATCH", "positive": "MATCH",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer2", "undefined": "layer2",
@ -112,7 +112,7 @@
} }
], ],
"threshold": 1, "threshold": 1,
"aggregation": "NC", "aggregation": "AND",
"positive": "layer3", "positive": "layer3",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer3", "undefined": "layer3",
@ -126,12 +126,11 @@
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": { "params": {
"windowSize": "4", "windowSize": "4"
"threshold": "0.7"
} }
} }
], ],
"threshold": 1.0, "threshold": 0.7,
"aggregation": "W_MEAN", "aggregation": "W_MEAN",
"positive": "layer4", "positive": "layer4",
"negative": "NO_MATCH", "negative": "NO_MATCH",
@ -146,13 +145,12 @@
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": { "params": {
"windowSize": "4", "windowSize": "4"
"threshold": "0.9"
} }
} }
], ],
"threshold": 1.0, "threshold": 0.9,
"aggregation": "W_MEAN", "aggregation": "AVG",
"positive": "layer5", "positive": "layer5",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer5", "undefined": "layer5",
@ -303,8 +301,8 @@
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"], "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
"key::106" : ["seminary", "seminario", "seminaire", "seminar"], "key::106" : ["seminary", "seminario", "seminaire", "seminar"],
"key::107" : ["agricultural forestry", "af", "a f", "a&f"], "key::107" : ["agricultural forestry", "af", "a f"],
"key::108" : ["agricultural mechanical", "am", "a m", "a&m"] "key::108" : ["agricultural mechanical", "am", "a m"]
} }
} }
} }

View File

@ -41,7 +41,7 @@ public class CityMatch extends AbstractComparator {
else { else {
if (codes1.isEmpty() ^ codes2.isEmpty()) if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; //undefined if one of the two has no cities return -1; //undefined if one of the two has no cities
return commonElementsPercentage(codes1, codes2) > Double.parseDouble(params.getOrDefault("threshold", "0.0")) ? 1.0 : 0.0; return commonElementsPercentage(codes1, codes2);
} }
} }
} }

View File

@ -24,6 +24,9 @@ public class ExactMatch extends AbstractComparator {
@Override @Override
public double distance(final String a, final String b, final Config conf) { public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) {
return -1.0; //return -1 if a field is missing
}
return a.equals(b) ? 1.0 : 0; return a.equals(b) ? 1.0 : 0;
} }

View File

@ -39,6 +39,8 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
ca = filterAllStopWords(ca); ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb); cb = filterAllStopWords(cb);
//TODO change this implementation: it only needs to remove cities and keywords
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));

View File

@ -46,7 +46,7 @@ public class JsonListMatch extends AbstractComparator {
return 0.0; return 0.0;
} }
return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0; return (double)incommon / (incommon + simDiff);
} }

View File

@ -41,7 +41,7 @@ public class KeywordMatch extends AbstractComparator {
else { else {
if (codes1.isEmpty() ^ codes2.isEmpty()) if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; //undefined if one of the two has no keywords return -1; //undefined if one of the two has no keywords
return commonElementsPercentage(codes1, codes2) > Double.parseDouble(params.getOrDefault("threshold", "0.0")) ? 1.0 : 0.0; return commonElementsPercentage(codes1, codes2);
} }
} }
} }

View File

@ -41,7 +41,7 @@ public class StringListMatch extends AbstractComparator {
return 0.0; return 0.0;
} }
return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0; return (double)incommon / (incommon + simDiff);
} }
} }

View File

@ -9,10 +9,8 @@ public enum AggType {
SUM, SUM,
MAX, MAX,
MIN, MIN,
NC, //necessary condition AND, //used for necessary conditions
SC, //sufficient condition OR; //used for sufficient conditions
AND,
OR;
public static AggType getEnum(String value) { public static AggType getEnum(String value) {

View File

@ -11,20 +11,30 @@ import java.io.Serializable;
public class FieldStats implements Serializable { public class FieldStats implements Serializable {
private double weight; //weight for the field (to be used in the aggregation) private double weight; //weight for the field (to be used in the aggregation)
private double threshold; //threshold for the field (to be used in case it is a sufficient or a necessary condition)
private double result; //the result of the comparison private double result; //the result of the comparison
private Field a; private Field a;
private Field b; private Field b;
private boolean countIfUndefined; private boolean countIfUndefined;
public FieldStats(double weight, double result, boolean countIfUndefined, Field a, Field b) { public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Field a, Field b) {
this.weight = weight; this.weight = weight;
this.threshold = threshold;
this.result = result; this.result = result;
this.countIfUndefined = countIfUndefined; this.countIfUndefined = countIfUndefined;
this.a = a; this.a = a;
this.b = b; this.b = b;
} }
/**
 * @return the threshold configured for this field; the {@code or()} / {@code and()}
 *         aggregations compare the field result against it
 */
public double getThreshold() {
    return this.threshold;
}
/**
 * Replaces the threshold stored for this field.
 *
 * @param threshold the new threshold value
 */
public void setThreshold(double threshold) {
    this.threshold = threshold;
}
public double getWeight() { public double getWeight() {
return weight; return weight;
} }

View File

@ -47,7 +47,7 @@ public class TreeNodeDef implements Serializable {
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()))); stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())));
} }

View File

@ -84,6 +84,32 @@ public class TreeNodeStats implements Serializable {
return min; return min;
} }
/**
 * Logical OR over the field comparisons of this node.
 *
 * @return 1.0 as soon as one field result reaches its own threshold, 0.0 otherwise
 *         (including when there are no field results)
 */
public double or(){
    // anyMatch short-circuits on the first satisfied field, like the early return of a loop
    final boolean anySatisfied = this.results.values().stream()
            .anyMatch(field -> field.getResult() >= field.getThreshold());
    return anySatisfied ? 1.0 : 0.0;
}
/**
 * Logical AND over the field comparisons of this node.
 *
 * A field fails the conjunction when its result is below its threshold, or when its
 * result is undefined (-1) and the field is flagged to count when undefined.
 * Undefined results that are not flagged are skipped.
 *
 * @return 0.0 as soon as one field fails, 1.0 otherwise
 */
public double and() {
    for (FieldStats entry : this.results.values()) {
        final boolean undefined = entry.getResult() == -1;
        final boolean failed = undefined
                ? entry.isCountIfUndefined()
                : entry.getResult() < entry.getThreshold();
        if (failed) {
            return 0.0;
        }
    }
    return 1.0;
}
public double getFinalScore(AggType aggregation){ public double getFinalScore(AggType aggregation){
switch (aggregation){ switch (aggregation){
@ -91,16 +117,16 @@ public class TreeNodeStats implements Serializable {
return scoreSum()/fieldsCount(); return scoreSum()/fieldsCount();
case SUM: case SUM:
return scoreSum(); return scoreSum();
case SC:
case OR:
case MAX: case MAX:
return max(); return max();
case NC:
case AND:
case MIN: case MIN:
return min(); return min();
case W_MEAN: case W_MEAN:
return weightedScoreSum()/weightSum(); return weightedScoreSum()/weightSum();
case OR:
return or();
case AND:
return and();
default: default:
return 0.0; return 0.0;
} }

View File

@ -38,7 +38,7 @@ public class TreeProcessor{
TreeNodeDef currentNode = config.decisionTree().get(current); TreeNodeDef currentNode = config.decisionTree().get(current);
//throw an exception if the node doesn't exist //throw an exception if the node doesn't exist
if (currentNode == null) if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + current); throw new PaceException("Missing tree node: " + current);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
treeStats.addNodeStats(current, stats); treeStats.addNodeStats(current, stats);

View File

@ -18,13 +18,9 @@ import java.util.function.Predicate;
public class MapDocumentUtil { public class MapDocumentUtil {
private static final ObjectMapper mapper = new ObjectMapper();
public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX); public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) { public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) {
MapDocument m = new MapDocument(); MapDocument m = new MapDocument();
m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json)); m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json));

View File

@ -100,7 +100,6 @@ that
the the
their their
theirs theirs
them
themselves themselves
then then
there there

View File

@ -1,14 +1,3 @@
0
1
2
3
4
5
6
7
8
9
_
a a
actualmente actualmente
acuerdo acuerdo
@ -637,7 +626,6 @@ todavia
todavía todavía
todo todo
todos todos
total
trabaja trabaja
trabajais trabajais
trabajamos trabajamos

View File

@ -211,7 +211,6 @@ encore
enfin enfin
entre entre
envers envers
environ
es es
essai essai
est est

View File

@ -62,17 +62,16 @@ public class ComparatorTest extends AbstractPaceFunctions {
final KeywordMatch keywordMatch = new KeywordMatch(params); final KeywordMatch keywordMatch = new KeywordMatch(params);
assertEquals(0.0, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); assertEquals(0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf)); assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf)); assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf)); assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); assertEquals(0.5, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.0, keywordMatch.distance("University College London", "University of London", conf)); assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.0, keywordMatch.distance("Washington State University", "University of Washington", conf)); assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf)); assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
} }
@Test @Test

View File

@ -65,36 +65,23 @@ public class ConfigTest extends AbstractPaceTest {
} }
@Test @Test
public void asMapDocumentTest() throws Exception { public void asMapDocumentTest() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
final String json = readFromClasspath("pub2.json"); final String json = readFromClasspath("organization.json");
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
System.out.println("mapDocument = " + mapDocument.getFieldMap()); System.out.println("mapDocument = " + mapDocument.getFieldMap());
System.out.println(mapDocument.getFieldMap().values().stream().map(Field::isEmpty).count());
} }
@Test @Test
public void testJPath() { public void testJPath() {
final String json = readFromClasspath("pub2.json"); final String json = readFromClasspath("organization.json");
final String jpath ="$.pid";
final List<String> jPathList = MapDocumentUtil.getJPathList(jpath, json, Type.JSON);
System.out.println("jPathList = " + jPathList);
final String jpath ="$.id";
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
} }
} }

View File

@ -1,6 +1,6 @@
{ {
"wf" : { "wf" : {
"threshold" : "0.9", "threshold" : "0.99",
"dedupRun" : "001", "dedupRun" : "001",
"entityType" : "organization", "entityType" : "organization",
"orderField" : "legalname", "orderField" : "legalname",
@ -8,7 +8,9 @@
"groupMaxSize" : "50", "groupMaxSize" : "50",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true" "includeChildren" : "true",
"maxIterations": "20",
"idPath": "$.id"
}, },
"pace" : { "pace" : {
"clustering" : [ "clustering" : [
@ -23,59 +25,110 @@
{ {
"field": "gridid", "field": "gridid",
"comparator": "exactMatch", "comparator": "exactMatch",
"weight": 1.0, "weight": 1,
"countIfUndefined": "true", "countIfUndefined": "false",
"params": {} "params": {}
} }
], ],
"threshold": 1.0, "threshold": 1,
"aggregation": "MAX", "aggregation": "SC",
"positive": "MATCH", "positive": "MATCH",
"negative": "layer2", "negative": "NO_MATCH",
"undefined": "layer2", "undefined": "layer2",
"ignoreUndefined": "true" "ignoreUndefined": "false"
}, },
"layer2": { "layer2": {
"fields": [ "fields": [
{ {
"field": "websiteurl", "field": "websiteurl",
"comparator": "domainExactMatch", "comparator": "domainExactMatch",
"weight": 1.0, "weight": 1,
"countIfUndefined": "true", "countIfUndefined": "false",
"params": {} "params": {}
}, },
{ {
"field": "country", "field": "country",
"comparator": "exactMatch", "comparator": "exactMatch",
"weight": 1.0, "weight": 1,
"countIfUndefined": "false", "countIfUndefined": "true",
"params": {}
},
{
"field": "legalname",
"comparator": "numbersMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
},
{
"field": "legalname",
"comparator": "romansMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {} "params": {}
} }
], ],
"threshold": 1.0, "threshold": 1,
"aggregation": "MIN", "aggregation": "NC",
"positive": "layer3", "positive": "layer3",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer3", "undefined": "layer3",
"ignoreUndefined": "false" "ignoreUndefined": "true"
}, },
"layer3": { "layer3": {
"fields": [ "fields": [
{ {
"field": "legalname", "field": "legalname",
"comparator": "jaroWinklerNormalizedName", "comparator": "cityMatch",
"weight": 0.9, "weight": 1.0,
"countIfUndefined": "true",
"params": {
"windowSize": "4"
}
}
],
"threshold": 0.1,
"aggregation": "W_MEAN",
"positive": "layer4",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"layer4": {
"fields": [
{
"field": "legalname",
"comparator": "keywordMatch",
"weight": 1.0,
"countIfUndefined": "false", "countIfUndefined": "false",
"params": { "params": {
"windowSize": 4, "windowSize": "4"
"threshold": 0.7 }
}
],
"threshold": 0.7,
"aggregation": "W_MEAN",
"positive": "layer5",
"negative": "NO_MATCH",
"undefined": "layer5",
"ignoreUndefined": "false"
},
"layer5": {
"fields": [
{
"field": "legalname",
"comparator": "jaroWinklerNormalizedName",
"weight": 0.9,
"countIfUndefined": "true",
"params": {
"windowSize": "4"
} }
}, },
{ {
"field": "legalshortname", "field": "legalshortname",
"comparator": "jaroWinklerNormalizedName", "comparator": "jaroWinklerNormalizedName",
"weight": 0.1, "weight": 0.1,
"countIfUndefined": "true", "countIfUndefined": "false",
"params": {} "params": {}
} }
], ],
@ -90,9 +143,9 @@
"model" : [ "model" : [
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" }, { "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"} { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid=='grid.ac')].value"}
], ],
"blacklists" : { "blacklists" : {
"legalname" : [] "legalname" : []
@ -192,7 +245,7 @@
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], "key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""], "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""], "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
@ -202,7 +255,10 @@
"key::102": ["informatics","informatica","informática","informática","informatica",""], "key::102": ["informatics","informatica","informática","informática","informatica",""],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
"key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"] "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
"key::107" : ["agricultural forestry", "af", "a f", "a&f"],
"key::108" : ["agricultural mechanical", "am", "a m", "a&m"]
} }
} }
} }

View File

@ -0,0 +1 @@
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000000985"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0004 0478 6426"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100000126"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100000190"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100000205"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100005822"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100005823"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100005824"},{"qualifier":{"classid":"OrgRef","classname":"OrgRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"70243"},{"qualifier":{"classid":"Wikidata","classname":"Wikidata","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"Q503577"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.239119.1"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"USDoC"},"websiteurl":{"value":"http://www.commerce.gov/"},"country":{"classid":"US","classname":"United States","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Departamento de Comercio de Estados Unidos"},{"value":"Département du commerce des États-unis"},{"value":"United States Department of Commerce"},{"value":"United States 
Department of Commerce and Labor"}],"legalname":{"value":"United States Department of Commerce"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::051dc42607887282d1939f094e5906f5"}

View File

@ -1,6 +1,6 @@
{ {
"wf" : { "wf" : {
"threshold" : "0.9", "threshold" : "0.99",
"dedupRun" : "001", "dedupRun" : "001",
"entityType" : "organization", "entityType" : "organization",
"orderField" : "legalname", "orderField" : "legalname",
@ -8,7 +8,8 @@
"groupMaxSize" : "50", "groupMaxSize" : "50",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true" "includeChildren" : "true",
"maxIterations": "20"
}, },
"pace" : { "pace" : {
"clustering" : [ "clustering" : [
@ -18,21 +19,124 @@
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
], ],
"decisionTree" : { "decisionTree" : {
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"}, "start": {
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"}, "fields": [
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"} {
"field": "gridid",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "SC",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "layer2",
"ignoreUndefined": "false"
},
"layer2": {
"fields": [
{
"field": "websiteurl",
"comparator": "domainExactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "country",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1,
"aggregation": "NC",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "true"
},
"layer3": {
"fields": [
{
"field": "legalname",
"comparator": "cityMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"windowSize": "4",
"threshold": "0.0"
}
}
],
"threshold": 1.0,
"aggregation": "W_MEAN",
"positive": "layer4",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"layer4": {
"fields": [
{
"field": "legalname",
"comparator": "keywordMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"windowSize": "4",
"threshold": "0.7"
}
}
],
"threshold": 1.0,
"aggregation": "W_MEAN",
"positive": "layer5",
"negative": "NO_MATCH",
"undefined": "layer5",
"ignoreUndefined": "false"
},
"layer5": {
"fields": [
{
"field": "legalname",
"comparator": "jaroWinklerNormalizedName",
"weight": 0.9,
"countIfUndefined": "true",
"params": {
"windowSize": "4"
}
},
{
"field": "legalshortname",
"comparator": "jaroWinklerNormalizedName",
"weight": 0.1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "W_MEAN",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
}, },
"model" : [ "model" : [
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" }, { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"} { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
], ],
"blacklists" : { "blacklists" : {
"legalname" : [] "legalname" : []
}, },
"synonyms": { "synonyms": {}
}
} }
} }