implementation of new configuration for datasource deduplication
This commit is contained in:
parent
fb2eed9f0e
commit
5aebe63f22
|
@ -183,7 +183,7 @@ public class DedupLocalTest extends DedupTestUtils {
|
||||||
|
|
||||||
//custom parameters for this test
|
//custom parameters for this test
|
||||||
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
|
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
|
||||||
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pubs.fdup.exp.json").toURI()).toFile().getAbsolutePath()
|
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/ds.tree.conf.json").toURI()).toFile().getAbsolutePath()
|
||||||
));
|
));
|
||||||
|
|
||||||
String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI()).toFile().getAbsolutePath();
|
String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI()).toFile().getAbsolutePath();
|
||||||
|
@ -236,7 +236,7 @@ public class DedupLocalTest extends DedupTestUtils {
|
||||||
System.out.println("Total time for mergerels creation : " + mergerels_time);
|
System.out.println("Total time for mergerels creation : " + mergerels_time);
|
||||||
System.out.println("Total time for dedupentity creation : " + dedupentity_time);
|
System.out.println("Total time for dedupentity creation : " + dedupentity_time);
|
||||||
|
|
||||||
FileUtils.deleteDirectory(new File(workingPath));
|
// FileUtils.deleteDirectory(new File(workingPath));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test //test the match between two JSON
|
@Test //test the match between two JSON
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
"threshold" : "0.99",
|
"threshold" : "0.99",
|
||||||
"dedupRun" : "001",
|
"dedupRun" : "001",
|
||||||
"entityType" : "datasource",
|
"entityType" : "datasource",
|
||||||
"orderField" : "name",
|
"orderField" : "officialname",
|
||||||
"queueMaxSize" : "2000",
|
"queueMaxSize" : "2000",
|
||||||
"groupMaxSize" : "50",
|
"groupMaxSize" : "50",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
|
@ -14,8 +14,8 @@
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "sortedngrampairs", "fields" : [ "name" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
|
||||||
{ "name" : "suffixprefix", "fields" : [ "name" ], "params" : { "max" : 1, "len" : "3" } },
|
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
|
||||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
|
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
|
||||||
],
|
],
|
||||||
"decisionTree" : {
|
"decisionTree" : {
|
||||||
|
@ -39,16 +39,36 @@
|
||||||
"layer2": {
|
"layer2": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "name",
|
"field": "officialname",
|
||||||
"comparator": "levensteinTitle",
|
"comparator": "levensteinTitle",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {
|
"params": {
|
||||||
|
"threshold": 0.9
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "englishname",
|
||||||
|
"comparator": "levensteinTitle",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
"threshold": 0.9
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "officialname",
|
||||||
|
"comparator": "levensteinTitle",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
"crossCompare": "englishname",
|
||||||
|
"threshold": 0.9
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.9,
|
"threshold": 0.9,
|
||||||
"aggregation": "AVG",
|
"aggregation": "MAX",
|
||||||
"positive": "MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "NO_MATCH",
|
"undefined": "NO_MATCH",
|
||||||
|
@ -56,7 +76,8 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"model" : [
|
"model" : [
|
||||||
{ "name" : "name", "type" : "String", "path" : "$.name" },
|
{ "name" : "englishname", "type" : "String", "path" : "$.englishname" },
|
||||||
|
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
|
||||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
|
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
|
||||||
],
|
],
|
||||||
"blacklists" : {
|
"blacklists" : {
|
||||||
|
|
Loading…
Reference in New Issue