diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java index 76ab518..a380b7f 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java @@ -183,7 +183,7 @@ public class DedupLocalTest extends DedupTestUtils { //custom parameters for this test DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS( - Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pubs.fdup.exp.json").toURI()).toFile().getAbsolutePath() + Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/ds.tree.conf.json").toURI()).toFile().getAbsolutePath() )); String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI()).toFile().getAbsolutePath(); @@ -236,7 +236,7 @@ public class DedupLocalTest extends DedupTestUtils { System.out.println("Total time for mergerels creation : " + mergerels_time); System.out.println("Total time for dedupentity creation : " + dedupentity_time); - FileUtils.deleteDirectory(new File(workingPath)); +// FileUtils.deleteDirectory(new File(workingPath)); } @Test //test the match between two JSON diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json index 83ee8b1..e631131 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json @@ -3,7 +3,7 @@ "threshold" : "0.99", "dedupRun" : "001", "entityType" : "datasource", - "orderField" : "name", + "orderField" : "officialname", "queueMaxSize" : "2000", "groupMaxSize" : "50", "slidingWindowSize" : "200", @@ -14,8 +14,8 @@ }, "pace" : { "clustering" : [ - { "name" : "sortedngrampairs", "fields" : [ "name" ], "params" : { "max" : 2, "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "name" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} }, + { "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } }, { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } ], "decisionTree" : { @@ -39,16 +39,36 @@ "layer2": { "fields": [ { - "field": "name", + "field": "officialname", "comparator": "levensteinTitle", "weight": 1.0, "countIfUndefined": "true", "params": { + "threshold": 0.9 + } + }, + { + "field": "englishname", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + "threshold": 0.9 + } + }, + { + "field": "officialname", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": { + "crossCompare": "englishname", + "threshold": 0.9 } } ], "threshold": 0.9, - "aggregation": "AVG", + "aggregation": "MAX", "positive": "MATCH", "negative": "NO_MATCH", "undefined": "NO_MATCH", @@ -56,7 +76,8 @@ } }, "model" : [ - { "name" : "name", "type" : "String", "path" : "$.name" }, + { "name" : "englishname", "type" : "String", "path" : "$.englishname" }, + { "name" : "officialname", "type" : "String", "path" : "$.officialname" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" } ], "blacklists" : {