Implement new configuration for datasource deduplication (replace the single `name` field with `officialname` and `englishname`)

This commit is contained in:
miconis 2022-04-26 11:30:40 +02:00
parent fb2eed9f0e
commit 5aebe63f22
2 changed files with 29 additions and 8 deletions

View File

@@ -183,7 +183,7 @@ public class DedupLocalTest extends DedupTestUtils {
//custom parameters for this test //custom parameters for this test
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS( DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pubs.fdup.exp.json").toURI()).toFile().getAbsolutePath() Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/ds.tree.conf.json").toURI()).toFile().getAbsolutePath()
)); ));
String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI()).toFile().getAbsolutePath(); String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI()).toFile().getAbsolutePath();
@@ -236,7 +236,7 @@ public class DedupLocalTest extends DedupTestUtils {
System.out.println("Total time for mergerels creation : " + mergerels_time); System.out.println("Total time for mergerels creation : " + mergerels_time);
System.out.println("Total time for dedupentity creation : " + dedupentity_time); System.out.println("Total time for dedupentity creation : " + dedupentity_time);
FileUtils.deleteDirectory(new File(workingPath)); // FileUtils.deleteDirectory(new File(workingPath));
} }
@Test //test the match between two JSON @Test //test the match between two JSON

View File

@@ -3,7 +3,7 @@
"threshold" : "0.99", "threshold" : "0.99",
"dedupRun" : "001", "dedupRun" : "001",
"entityType" : "datasource", "entityType" : "datasource",
"orderField" : "name", "orderField" : "officialname",
"queueMaxSize" : "2000", "queueMaxSize" : "2000",
"groupMaxSize" : "50", "groupMaxSize" : "50",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
@@ -14,8 +14,8 @@
}, },
"pace" : { "pace" : {
"clustering" : [ "clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "name" ], "params" : { "max" : 2, "ngramLen" : "3"} }, { "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
{ "name" : "suffixprefix", "fields" : [ "name" ], "params" : { "max" : 1, "len" : "3" } }, { "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
], ],
"decisionTree" : { "decisionTree" : {
@@ -39,16 +39,36 @@
"layer2": { "layer2": {
"fields": [ "fields": [
{ {
"field": "name", "field": "officialname",
"comparator": "levensteinTitle", "comparator": "levensteinTitle",
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": { "params": {
"threshold": 0.9
}
},
{
"field": "englishname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "officialname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"crossCompare": "englishname",
"threshold": 0.9
} }
} }
], ],
"threshold": 0.9, "threshold": 0.9,
"aggregation": "AVG", "aggregation": "MAX",
"positive": "MATCH", "positive": "MATCH",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "NO_MATCH", "undefined": "NO_MATCH",
@@ -56,7 +76,8 @@
} }
}, },
"model" : [ "model" : [
{ "name" : "name", "type" : "String", "path" : "$.name" }, { "name" : "englishname", "type" : "String", "path" : "$.englishname" },
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" } { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
], ],
"blacklists" : { "blacklists" : {