dnet-hadoop/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.new.conf.json

214 lines
5.6 KiB
JSON

{
"wf": {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "otherresearchproduct",
"orderField" : "title",
"queueMaxSize" : "100",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "100",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true",
"idPath" : "$.id",
"maxIterations" : 20
},
"pace": {
"clustering": [
{
"name": "wordsStatsSuffixPrefixChain",
"fields": [
"title"
],
"params": {
"mod": "10"
}
},
{
"name": "lowercase",
"fields": [
"doi",
"altdoi"
],
"params": {
"collapseOn:pid": "0"
}
}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "pidVSaltid",
"undefined": "pidVSaltid",
"ignoreUndefined": "false"
},
"pidVSaltid": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "softCheck",
"negative": "earlyExits",
"undefined": "earlyExits",
"ignoreUndefined": "true"
},
"softCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"earlyExits": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "strongCheck",
"negative": "NO_MATCH",
"undefined": "strongCheck",
"ignoreUndefined": "false"
},
"strongCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "surnames",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"mode": "surname"
}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "alternateid",
"type": "JSON",
"path": "$.instance[*].alternateIdentifier[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
},
{
"name": "instance",
"type": "List",
"path": "$.instance[*].instancetype.classname"
}
],
"blacklists": {},
"synonyms": {}
}
}