Add deduplication profiles for the results: double check on PIDs and on the title, with a lower threshold
parent
5a043e95ea
commit
0fe40b08e4
@ -0,0 +1,27 @@
|
||||
<RESOURCE_PROFILE>
<!-- Dedup orchestration profile. It associates the entity labelled
     "Dedup decisiontree" and the action set
     "dedup-similarity-result-decisiontree-v2" with the list of dedup
     configuration profiles referenced by the SCAN entries further down. -->
|
||||
<HEADER>
|
||||
<!-- The part of the identifier after "_" is the Base64 encoding of
     "DedupOrchestrationDSResources/DedupOrchestrationDSResourceType",
     i.e. the RESOURCE_KIND and RESOURCE_TYPE values declared below joined by "/". -->
<RESOURCE_IDENTIFIER value="19e7fb88-8c85-4eb4-9644-de6bba5534ef_RGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
|
||||
<RESOURCE_TYPE value="DedupOrchestrationDSResourceType"/>
|
||||
<RESOURCE_KIND value="DedupOrchestrationDSResources"/>
|
||||
<RESOURCE_URI value=""/>
|
||||
<DATE_OF_CREATION value="2020-04-25T08:15:01+00:00"/>
|
||||
</HEADER>
|
||||
<BODY>
|
||||
<CONFIGURATION enabled="true">
|
||||
<DEDUPLICATION>
|
||||
<!-- NOTE(review): label and name carry the same free-text value and the meaning
     of code="20" is not visible in this file; confirm against the consumer
     of this profile. -->
<ENTITY code="20" label="Dedup decisiontree" name="Dedup decisiontree"/>
|
||||
<ACTION_SET id="dedup-similarity-result-decisiontree-v2"/>
|
||||
<!-- Each SCAN id has the form of a configuration profile RESOURCE_IDENTIFIER:
     the suffix after "_" is the Base64 encoding of
     "DedupConfigurationDSResources/DedupConfigurationDSResourceType".
     Only the last id (923fe17a...) matches a profile visible in this diff;
     the other three are presumably defined elsewhere (TODO confirm they exist).
     Presumably the scans are applied in the listed order; verify with the consumer. -->
<SCAN_SEQUENCE>
|
||||
<SCAN id="fabcfb5d-f01d-4e98-ba18-4b36c27f49e8_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
|
||||
<SCAN id="845d98da-eeb4-4d32-823c-1d79d30981f6_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
|
||||
<SCAN id="87f57680-b136-4dcc-9260-6a82355efb01_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
|
||||
<SCAN id="923fe17a-c697-41f2-beb5-f2d72c17334c_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
|
||||
</SCAN_SEQUENCE>
|
||||
</DEDUPLICATION>
|
||||
</CONFIGURATION>
|
||||
<STATUS>
|
||||
<!-- NOTE(review): this timestamp predates DATE_OF_CREATION (2020-04-25) and,
     unlike it, carries no UTC offset; it looks like a template placeholder. Confirm. -->
<LAST_UPDATE value="2001-12-31T12:00:00"/>
|
||||
</STATUS>
|
||||
<!-- The element text is the literal string "SECURITY_PARAMETERS". -->
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
|
||||
</BODY>
|
||||
</RESOURCE_PROFILE>
|
@ -0,0 +1,128 @@
|
||||
<RESOURCE_PROFILE>
<!-- Dedup configuration profile described as
     "Software: Decision Tree Dedup - v2.0". The DEDUPLICATION element below
     carries the whole configuration as a JSON document in its text content. -->
|
||||
<HEADER>
|
||||
<!-- The part of the identifier after "_" is the Base64 encoding of
     "DedupConfigurationDSResources/DedupConfigurationDSResourceType",
     i.e. the RESOURCE_KIND and RESOURCE_TYPE values declared below joined by "/". -->
<RESOURCE_IDENTIFIER value="923fe17a-c697-41f2-beb5-f2d72c17334c_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
|
||||
<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
|
||||
<RESOURCE_KIND value="DedupConfigurationDSResources"/>
|
||||
<RESOURCE_URI value=""/>
|
||||
<DATE_OF_CREATION value="2020-07-10T16:09:43+00:00"/>
|
||||
</HEADER>
|
||||
<BODY>
|
||||
<CONFIGURATION>
|
||||
<DESCRIPTION>Software: Decision Tree Dedup - v2.0</DESCRIPTION>
|
||||
<!-- JSON payload overview (no comments are placed inside the element because
     its text content is the JSON itself):

     "wf"   : workflow settings. entityType is "result", with subEntityType
              "resulttype" and subEntityValue "software" (presumably restricting
              the run to software results; confirm with the consumer). Records
              are ordered by "title". Most numeric settings are given as JSON strings.

     "pace" : matching settings.
       clustering   : "wordsStatsSuffixPrefixChain" on title (mod 10) and
                      "lowercase" on doi.
       decisionTree : three nodes.
         start  : doi and url compared with "exactMatch", aggregation OR,
                  threshold 1. A positive outcome goes to layer1; negative and
                  undefined outcomes go to layer2.
         layer1 : title compared with "levensteinTitleIgnoreVersion",
                  threshold 0.9. Positive gives MATCH, otherwise NO_MATCH.
         layer2 : same title comparator with threshold 0.99. Positive gives
                  MATCH, otherwise NO_MATCH.
         So records that already agree on doi or url need a lower title
         similarity (0.9) than records that do not (0.99).
       model        : JSONPath expressions that extract doi, title, url and
                      resulttype from a record.
                      NOTE(review): "$.instance.url" uses no array selector,
                      unlike the filtered pid and title paths. If instance is
                      an array, confirm the JSONPath engine resolves it as intended.
       blacklists, synonyms : both empty. -->
<DEDUPLICATION>
|
||||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.99",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "result",
|
||||
"subEntityType" : "resulttype",
|
||||
"subEntityValue" : "software",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "200",
|
||||
"groupMaxSize" : "100",
|
||||
"maxChildren" : "100",
|
||||
"slidingWindowSize" : "50",
|
||||
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
|
||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "doi",
|
||||
"comparator": "exactMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "url",
|
||||
"comparator": "exactMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1,
|
||||
"aggregation": "OR",
|
||||
"positive": "layer1",
|
||||
"negative": "layer2",
|
||||
"undefined": "layer2",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"layer1": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "levensteinTitleIgnoreVersion",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.9,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"layer2": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "levensteinTitleIgnoreVersion",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.99,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "false"
|
||||
}
|
||||
},
|
||||
"model" : [
|
||||
{
|
||||
"name" : "doi",
|
||||
"type" : "String",
|
||||
"path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
|
||||
},
|
||||
{
|
||||
"name" : "title",
|
||||
"type" : "String",
|
||||
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||
"length" : 250,
|
||||
"size" : 5
|
||||
},
|
||||
{
|
||||
"name" : "url",
|
||||
"type" : "String",
|
||||
"path" : "$.instance.url"
|
||||
},
|
||||
{
|
||||
"name" : "resulttype",
|
||||
"type" : "String",
|
||||
"path" : "$.resulttype.classid"
|
||||
}
|
||||
],
|
||||
"blacklists" : {},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
||||
</DEDUPLICATION>
|
||||
</CONFIGURATION>
|
||||
<STATUS>
|
||||
<!-- NOTE(review): this timestamp predates DATE_OF_CREATION (2020-07-10) and,
     unlike it, carries no UTC offset; it looks like a template placeholder. Confirm. -->
<LAST_UPDATE value="2001-12-31T12:00:00"/>
|
||||
</STATUS>
|
||||
<!-- The element text is the literal string "SECURITY_PARAMETERS". -->
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
|
||||
</BODY>
|
||||
</RESOURCE_PROFILE>
|
Loading…
Reference in New Issue