{
  "wf" : {
    "threshold" : "0.98",
    "dedupRun" : "001",
    "entityType" : "organization",
    "orderField" : "legalname",
    "queueMaxSize" : "2000",
    "groupMaxSize" : "10",
    "slidingWindowSize" : "200",
    "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
    "includeChildren" : "true"
  },
  "pace" : {
    "clustering" : [
      { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3"} },
      { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
      { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
    ],
    "necessaryConditions" : [
      { "name" : "exactMatch", "fields" : [ "country" ] },
      { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
    ],
    "decisionTree": {},
    "model" : [
      { "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
      { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
      { "name" : "legalshortname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
      { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.7", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : { "windowSize" : 4 } },
      { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }
    ],
    "blacklists" : { }
  }
}