dnet-dedup/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf

41 lines
2.3 KiB
Plaintext

{
  "wf": {
    "threshold": "0.9",
    "dedupRun": "001",
    "entityType": "organization",
    "orderField": "legalname",
    "queueMaxSize": "2000",
    "groupMaxSize": "50",
    "slidingWindowSize": "200",
    "rootBuilder": [
      "organization",
      "projectOrganization_participation_isParticipant",
      "datasourceOrganization_provision_isProvidedBy"
    ],
    "includeChildren": "true"
  },
  "pace": {
    "clustering": [
      {
        "name": "sortedngrampairs",
        "fields": [
          "legalname"
        ],
        "params": {
          "max": 2,
          "ngramLen": "3"
        }
      },
      {
        "name": "suffixprefix",
        "fields": [
          "legalname"
        ],
        "params": {
          "max": 1,
          "len": "3"
        }
      },
      {
        "name": "urlclustering",
        "fields": [
          "websiteurl"
        ],
        "params": {}
      },
      {
        "name": "keywordsclustering",
        "fields": [
          "legalname"
        ],
        "params": {
          "max": 2,
          "windowSize": 4
        }
      }
    ],
    "sufficientConditions": [
      {
        "name": "exactMatch",
        "fields": [
          "gridid"
        ]
      }
    ],
    "necessaryConditions": [
      {
        "name": "DomainExactMatch",
        "fields": [
          "websiteurl"
        ]
      },
      {
        "name": "exactMatch",
        "fields": [
          "country"
        ]
      }
    ],
    "decisionTree": {
      "start": {
        "fields": [
          {
            "field": "legalname",
            "comparator": "jaroWinklerNormalizedName",
            "weight": 0.9,
            "ignoreMissing": "false",
            "params": {
              "windowSize": 4,
              "threshold": 0.7
            }
          },
          {
            "field": "legalshortname",
            "comparator": "jaroWinklerNormalizedName",
            "weight": 0.1,
            "ignoreMissing": "true",
            "params": {}
          }
        ],
        "threshold": 0.9,
        "aggregation": "WEIGHTED_MEAN",
        "positive": "MATCH",
        "negative": "NO_MATCH",
        "undefined": "NO_MATCH",
        "ignoreMissing": "true"
      }
    },
    "model": [
      {
        "name": "country",
        "type": "String",
        "path": "organization/metadata/country/classid",
        "ignoreMissing": "true"
      },
      {
        "name": "legalshortname",
        "type": "String",
        "path": "organization/metadata/legalshortname/value",
        "ignoreMissing": "true"
      },
      {
        "name": "legalname",
        "type": "String",
        "path": "organization/metadata/legalname/value",
        "params": {
          "windowSize": 4,
          "threshold": 0.7
        },
        "ignoreMissing": "false"
      },
      {
        "name": "websiteurl",
        "type": "URL",
        "path": "organization/metadata/websiteurl/value",
        "params": {
          "host": 0.5,
          "path": 0.5
        },
        "ignoreMissing": "true"
      },
      {
        "name": "gridid",
        "type": "String",
        "path": "pid[qualifier#classid = {grid}]/value",
        "ignoreMissing": "true"
      }
    ],
    "blacklists": {
      "legalname": []
    }
  }
}