diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json index e631131..976a448 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/ds.tree.conf.json @@ -3,7 +3,7 @@ "threshold" : "0.99", "dedupRun" : "001", "entityType" : "datasource", - "orderField" : "officialname", + "orderField" : "englishname", "queueMaxSize" : "2000", "groupMaxSize" : "50", "slidingWindowSize" : "200", @@ -16,6 +16,7 @@ "clustering" : [ { "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} }, { "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } }, + {"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}}, { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } ], "decisionTree" : { @@ -80,9 +81,7 @@ { "name" : "officialname", "type" : "String", "path" : "$.officialname" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" } ], - "blacklists" : { - "legalname" : [] - }, + "blacklists" : {}, "synonyms": {} } } \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pubs.fdup.exp.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pubs.fdup.exp.json index 9fe1faf..4883bd5 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pubs.fdup.exp.json +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/pubs.fdup.exp.json @@ -9,7 +9,7 @@ "queueMaxSize": "200", "groupMaxSize": "100", "maxChildren": "100", - "slidingWindowSize": "50", + "slidingWindowSize": "100", "rootBuilder": [ "result", "resultProject_outcome_isProducedBy", @@ -29,7 +29,7 @@ }, "pace": { "clustering" : [ - { "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} } ], "decisionTree": { @@ -42,18 +42,36 @@ "countIfUndefined": "false", "params": { "jpath_value": "$.value", - "jpath_classid": "$.qualifier.classid" + "jpath_classid": "$.qualifier.classid", + "mode": "count" } } ], - "threshold": 0.5, + "threshold": 1.0, "aggregation": "MAX", - "positive": "softCheck", - "negative": "hardCheck1", - "undefined": "hardCheck1", + "positive": "MATCH", + "negative": "versionCheck", + "undefined": "versionCheck", "ignoreUndefined": "true" }, - "softCheck": { + "versionCheck": { + "fields": [ + { + "field": "title", + "comparator": "titleVersionMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1.0, + "aggregation": "MAX", + "positive": "titleCheck", + "negative": "NO_MATCH", + "undefined": "titleCheck", + "ignoreUndefined": "false" + }, + "titleCheck": { "fields": [ { "field": "title", @@ -64,52 +82,28 @@ } ], "threshold": 0.9, - "aggregation": "AVG", - "positive": "MATCH", + "aggregation": "MAX", + "positive": "authorsCheck", "negative": "NO_MATCH", "undefined": "NO_MATCH", "ignoreUndefined": "true" }, - "hardCheck1": { + "authorsCheck": { "fields": [ - { - "field": "title", - "comparator": "titleVersionMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": {} - }, { "field": "authors", - "comparator": "sizeMatch", + "comparator": "authorsMatch", "weight": 1.0, "countIfUndefined": "false", "params": {} } ], - "threshold": 1.0, - "aggregation": "AND", - "positive": "hardCheck2", - "negative": "NO_MATCH", - "undefined": "hardCheck2", - "ignoreUndefined": "false" - }, - "hardCheck2": { - "fields": [ - { - "field": "title", - "comparator": "levensteinTitle", - "weight": 1.0, - "countIfUndefined": "true", - "params": {} - } - ], - "threshold": 0.99, - "aggregation": "AVG", + "threshold": 0.6, + "aggregation": "MAX", "positive": "MATCH", "negative": "NO_MATCH", - "undefined": "NO_MATCH", - "ignoreUndefined": "true" + "undefined": "MATCH", + "ignoreUndefined": "false" } }, "model": [ diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/ds.to.fix.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/ds.to.fix.json new file mode 100644 index 0000000..fe6ec0b --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/ds.to.fix.json @@ -0,0 +1,4 @@ +{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"} +{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"} +{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"} +{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java index e2eb0cd..958028e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java @@ -16,6 +16,7 @@ public class DomainExactMatch extends ExactMatchIgnoreCase { @Override protected String getValue(final Field f) { + try { return asUrl(super.getValue(f)).getHost(); } catch (MalformedURLException e) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java index e3ce6a9..174c5c1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java @@ -161,11 +161,14 @@ public class BlockProcessorForTesting { } else { //use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications) - if(useTree) + if (useTree) emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); else emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context); } +// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) { +// emitOutput(true, idPivot, idCurr, context); +// } } } @@ -180,44 +183,45 @@ public class BlockProcessorForTesting { return compare>=1.0; } - private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { + private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { + //if the score gives 1, the publications are equivalent + Map params = new HashMap<>(); + params.put("jpath_value", "$.value"); + params.put("jpath_classid", "$.qualifier.classid"); + params.put("mode", "count"); - boolean startLayer = false; - boolean hardcheck1Layer = false; + double score = 0.0; - //START - comparison of the PIDs json lists - Map params = new HashMap<>(); - params.put("jpath_value", "$.value"); - params.put("jpath_classid", "$.qualifier.classid"); - JsonListMatch jsonListMatch = new JsonListMatch(params); - double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config); - if (result >= 0.5) //if the result of the comparison is greater than the threshold - startLayer = true; - - //HARDCHECK1 - comparison of title versions and authors size - TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); - double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); - SizeMatch sizeMatch = new SizeMatch(params); - double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); - if (Math.min(result1, result2) != 0) - hardcheck1Layer = true; - - //SOFTCHECK and HARDCHECK2 - comparison of the titles - LevensteinTitle levensteinTitle = new LevensteinTitle(params); - double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); - double titleScore = Double.isNaN(result3)?0.0:result3; - - if (startLayer) { - return titleScore >= 0.90; - } - else { - if (hardcheck1Layer) { - return titleScore >= 0.99; - } - } - return false; + //levenstein title + LevensteinTitle levensteinTitle = new LevensteinTitle(params); + if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) { + score += 0.2; } + //pid + JsonListMatch jsonListMatch = new JsonListMatch(params); + if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) { + score += 0.5; + } + + //title version + TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); + double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); + if(result1<0 || result1>=1.0) { + score += 0.1; + } + + //authors match + params.remove("mode"); + AuthorsMatch authorsMatch = new AuthorsMatch(params); + double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); + if(result2 <0|| result2>=0.6) { + score += 0.2; + } + + return score>=0.5; + } + private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { if (result) { diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 8d41a37..b84d48d 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -103,6 +103,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); + + params.put("len", 3); + params.put("max", 1); + + System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication")))); } @Test diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 749802f..b79305b 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -272,5 +272,17 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(1.0, result); } + @Test + public void domainExactMatch() { + + DomainExactMatch domainExactMatch = new DomainExactMatch(params); + Field a = url("http://www.flowrepository.org"); + Field b = url("http://flowrepository.org/"); + + double compare = domainExactMatch.compare(a, b, conf); + System.out.println("compare = " + compare); + + } + }