{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "datasource",
"orderField" : "englishname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"idPath":"$.id",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true",
"maxIterations": "20"
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : 3, "collapseOn:name": "0"} },
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : 3, "collapseOn:name": "0" } },
{"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
],
"decisionTree" : {
"start": {
"fields": [
{
"field": "websiteurl",
"comparator": "domainExactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "AVG",
"positive": "layer2",
"negative": "NO_MATCH",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "officialname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "englishname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "officialname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"crossCompare": "englishname",
"threshold": 0.9
}
}
],
"threshold": 0.9,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model" : [
{ "name" : "englishname", "type" : "String", "path" : "$.englishname" },
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
],
"blacklists" : {},
"synonyms": {}
}
}