From 9ddd24ba363a4cc6c6f3373f8466c69bfed88e31 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 19 Apr 2022 10:18:09 +0200 Subject: [PATCH] implementation of comparators and clustering function for the author deduplication --- .../dnetlib/pace/clustering/PersonHash.java | 2 +- .../dnetlib/pace/tree/NumbersComparator.java | 34 ++++++ .../pace/tree/support/TreeNodeDef.java | 5 +- .../clustering/ClusteringFunctionTest.java | 30 +++++- .../pace/comparators/ComparatorTest.java | 5 + .../eu/dnetlib/pace/config/ConfigTest.java | 15 ++- .../eu/dnetlib/pace/config/author.json | 1 + .../dnetlib/pace/config/author.test.conf.json | 102 ++++++++++++++++++ 8 files changed, 190 insertions(+), 4 deletions(-) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java index 2020a662f..f6c4fe07f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -9,7 +9,7 @@ import com.google.common.collect.Lists; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Person; -@ClusteringClass("personhash") +@ClusteringClass("personHash") public class PersonHash extends AbstractClusteringFunction { private boolean DEFAULT_AGGRESSIVE = false; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java new file mode 100644 index 000000000..ac6d78403 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java @@ -0,0 +1,34 @@ +package eu.dnetlib.pace.tree; + +import 
eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("numbersComparator") +public class NumbersComparator extends AbstractComparator { + + Map params; + + public NumbersComparator(Map params) { + super(params); + this.params = params; + } + /** Returns the absolute difference of the numbers extracted from a and b, or -1.0 when either field yields none. */ + @Override + public double distance(String a, String b, Config conf) { + + //extracts numbers from the field + String numbers1 = getNumbers(nfd(a)); + String numbers2 = getNumbers(nfd(b)); + + if (numbers1.isEmpty() || numbers2.isEmpty()) + return -1.0; + // BigInteger instead of int: digit runs beyond the int range used to throw NumberFormatException, and n1 - n2 could overflow + java.math.BigInteger n1 = new java.math.BigInteger(numbers1); + java.math.BigInteger n2 = new java.math.BigInteger(numbers2); + + return n1.subtract(n2).abs().doubleValue(); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 43b3a9276..f7ebe96d1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.tree.support; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.PaceConfig; @@ -9,6 +10,7 @@ import org.apache.commons.lang3.StringUtils; import java.io.IOException; import java.io.Serializable; +import java.io.StringWriter; import java.util.List; public class TreeNodeDef implements Serializable { @@ -57,8 +59,9 @@ public class TreeNodeDef implements Serializable { double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf); result = Math.max(result1,result2); } - else + else { result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); + } stats.addFieldStats( 
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 0487e5b16..8d41a3760 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -200,4 +200,32 @@ public class ClusteringFunctionTest extends AbstractPaceTest { } -} + @Test + public void testPersonClustering(){ + // Prints what PersonClustering returns for two sample names; nothing is asserted. + final ClusteringFunction cf = new PersonClustering(params); + final String s = "Abd-Alla, Abo-el-nour N."; + System.out.println("s = " + s); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); + + final String s1 = "Manghi, Paolo"; + System.out.println("s1 = " + s1); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); + + } + + @Test + public void testPersonHash(){ + // Prints what PersonHash returns for a full name and for its abbreviated form; nothing is asserted. + final ClusteringFunction cf = new PersonHash(params); + final String s = "Manghi, Paolo"; + System.out.println("s = " + s); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); + + final String s1 = "Manghi, P."; + System.out.println("s1 = " + s1); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); + + } + +} \ No newline at end of file diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index f4eee93e0..749802f0d 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -9,6 +9,7 @@ import eu.dnetlib.pace.config.DedupConfig; import org.junit.jupiter.api.*; import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.ArrayList; import java.util.Arrays; import 
java.util.HashMap; import java.util.Map; @@ -246,6 +247,10 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(0.25, result); + Field f = createFieldList(new ArrayList<>(), "authors"); + result = authorsMatch.compare(f,f, conf); + System.out.println("result = " + result); + } @Test diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 879e5724c..56d8530be 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -85,7 +85,7 @@ public class ConfigTest extends AbstractPaceTest { } @Test - public void asMapDocumentTest() { + public void asMapDocumentTest1() { DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); @@ -103,6 +103,19 @@ public class ConfigTest extends AbstractPaceTest { } + @Test + public void asMapDocumentTest2() { + // Loads author.test.conf.json and author.json from the classpath, builds a MapDocument and prints its "coauthors" field; nothing is asserted. + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.test.conf.json")); + + final String json = readFromClasspath("author.json"); + + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); + + System.out.println("mapDocument = " + mapDocument.getFieldMap().get("coauthors").stringValue()); + + } + @Test public void testJPath() { final String json = readFromClasspath("organization.json"); diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json new file mode 100644 index 000000000..62c6e9185 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json @@ -0,0 +1 @@ +{"id": "f3389e7c8af1d806c06e2ab51f28a4b4", "name": "Aczél, János", "shortname": "Aczél, J.", "pid": "aczel.janos", "coauthors": [], "publication": {"year": "1955", "title": "L\\\"osung der Vektor-Funktionalgleichung der homogenen und inhomogenen 
$n$-dimensionalen einparametrigen ``Translation'' der erzeugenden Funktion von Kettenreaktionen und des station\\\"aren und nichtstation\\\"aren Bewegungsintegrals", "venue": "Acta Math. Acad. Sci. Hung. 6, 131-140 (1955)."}} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json new file mode 100644 index 000000000..0ac29f875 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json @@ -0,0 +1,102 @@ +{ + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "author", + "subEntityType": "author", + "subEntityValue": "author", + "orderField": "fullname", + "queueMaxSize": "200", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "50", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + "resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.id" + }, + "pace": { + "clustering" : [ + { "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} }, + { "name" : "personHash", "fields" : [ "fullname" ], "params" : {} } + ], + "decisionTree": { + "start": { + "fields": [ + { + "field": "year", + "comparator": "numbersComparator", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 50, + "aggregation": "MAX", + "positive": "NO_MATCH", + "negative": "surnames", + "undefined": "surnames", + "ignoreUndefined": "true" + }, + "surnames": { + "fields": [ + { + "field": "coauthors", + 
"comparator": "authorsMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "surname_th": 0.75, + "fullname_th": 0.75, + "size_th": 20, + "mode": "surname" + } + } + ], + "threshold": 0.6, + "aggregation": "MAX", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "fullname", + "type": "String", + "path": "$.name" + }, + { + "name": "coauthors", + "type": "List", + "path": "$.coauthors[*].name", + "size": 200 + }, + { + "name": "year", + "type": "String", + "path": "$.publication.year" + }, + { + "name": "title", + "type": "String", + "path": "$.publication.title" + } + ], + "blacklists": {}, + "synonyms": {} + } +} \ No newline at end of file