From 6c47fb0e67b388c815566ac0e78d307408aa47e4 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 19 Apr 2022 10:18:09 +0200 Subject: [PATCH] implementation of comparators and clustering function for the author deduplication --- dnet-dedup-test/job-override.properties | 16 ++- .../java/eu/dnetlib/pace/DedupLocalTest.java | 5 +- .../dnetlib/pace/config/auth.tree.conf.json | 102 ++++++++++++++++++ .../dnetlib/pace/clustering/PersonHash.java | 2 +- .../dnetlib/pace/tree/NumbersComparator.java | 34 ++++++ .../pace/tree/support/TreeNodeDef.java | 5 +- .../clustering/ClusteringFunctionTest.java | 30 +++++- .../pace/comparators/ComparatorTest.java | 5 + .../eu/dnetlib/pace/config/ConfigTest.java | 15 ++- .../eu/dnetlib/pace/config/author.json | 1 + .../dnetlib/pace/config/author.test.conf.json | 102 ++++++++++++++++++ 11 files changed, 305 insertions(+), 12 deletions(-) create mode 100644 dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/auth.tree.conf.json create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json diff --git a/dnet-dedup-test/job-override.properties b/dnet-dedup-test/job-override.properties index 7c0dc50..309e615 100644 --- a/dnet-dedup-test/job-override.properties +++ b/dnet-dedup-test/job-override.properties @@ -1,6 +1,12 @@ #entitiesPath = /tmp/publications_test_dump -entitiesPath = /user/michele.debonis/raw_graph_for_testing/publication -workingPath = /user/michele.debonis/new_dedup_test/workingdirtree -dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json -numPartitions = 8000 -useTree = false \ No newline at end of file +#entitiesPath = /user/michele.debonis/raw_graph_for_testing/publication +#workingPath = /user/michele.debonis/new_dedup_test/workingdirtree +#dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json +#numPartitions = 8000 +#useTree = false + +useTree = true +numPartitions = 1 +dedupConfPath = /user/michele.debonis/authors_dedup_test/auth.tree.conf.json +workingPath = /user/michele.debonis/authors_dedup_test/workingdir +entitiesPath = /user/michele.debonis/authors_dedup_test/authors-scad-zbmath-1.json \ No newline at end of file diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java index 12087d9..02a2879 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java @@ -23,7 +23,6 @@ import org.apache.spark.api.java.function.ForeachFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.*; import org.junit.jupiter.api.extension.ExtendWith; @@ -31,8 +30,6 @@ import org.mockito.junit.jupiter.MockitoExtension; import scala.Tuple2; import java.awt.*; -import java.awt.event.WindowAdapter; -import java.awt.event.WindowEvent; import java.io.BufferedReader; import java.io.File; import java.io.IOException; @@ -184,7 +181,9 @@ public class DedupLocalTest extends DedupTestUtils { DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS( Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json").toURI()).toFile().getAbsolutePath() )); + String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()).toFile().getAbsolutePath(); + String simRelsPath = workingPath + "/simrels"; String mergeRelsPath = workingPath + "/mergerels"; String outputPath = workingPath + "/dedup"; diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/auth.tree.conf.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/auth.tree.conf.json new file mode 100644 index 0000000..0ac29f8 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/config/auth.tree.conf.json @@ -0,0 +1,102 @@ +{ + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "author", + "subEntityType": "author", + "subEntityValue": "author", + "orderField": "fullname", + "queueMaxSize": "200", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "50", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + "resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.id" + }, + "pace": { + "clustering" : [ + { "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} }, + { "name" : "personHash", "fields" : [ "fullname" ], "params" : {} } + ], + "decisionTree": { + "start": { + "fields": [ + { + "field": "year", + "comparator": "numbersComparator", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 50, + "aggregation": "MAX", + "positive": "NO_MATCH", + "negative": "surnames", + "undefined": "surnames", + "ignoreUndefined": "true" + }, + "surnames": { + "fields": [ + { + "field": "coauthors", + "comparator": "authorsMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "surname_th": 0.75, + "fullname_th": 0.75, + "size_th": 20, + "mode": "surname" + } + } + ], + "threshold": 0.6, + "aggregation": "MAX", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "fullname", + "type": "String", + "path": "$.name" + }, + { + "name": "coauthors", + "type": "List", + "path": "$.coauthors[*].name", + "size": 200 + }, + { + "name": "year", + "type": "String", + "path": "$.publication.year" + }, + { + "name": "title", + "type": "String", + "path": "$.publication.title" + } + ], + "blacklists": {}, + "synonyms": {} + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java index 2020a66..f6c4fe0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -9,7 +9,7 @@ import com.google.common.collect.Lists; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Person; -@ClusteringClass("personhash") +@ClusteringClass("personHash") public class PersonHash extends AbstractClusteringFunction { private boolean DEFAULT_AGGRESSIVE = false; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java new file mode 100644 index 0000000..ac6d784 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java @@ -0,0 +1,34 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("numbersComparator") +public class NumbersComparator extends AbstractComparator { + + Map params; + + public NumbersComparator(Map params) { + super(params); + this.params = params; + } + + @Override + public double distance(String a, String b, Config conf) { + + //extracts numbers from the field + String numbers1 = getNumbers(nfd(a)); + String numbers2 = getNumbers(nfd(b)); + + if (numbers1.isEmpty() || numbers2.isEmpty()) + return -1.0; + + int n1 = Integer.parseInt(numbers1); + int n2 = Integer.parseInt(numbers2); + + return Math.abs(n1 - n2); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 43b3a92..f7ebe96 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.tree.support; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.PaceConfig; @@ -9,6 +10,7 @@ import org.apache.commons.lang3.StringUtils; import java.io.IOException; import java.io.Serializable; +import java.io.StringWriter; import java.util.List; public class TreeNodeDef implements Serializable { @@ -57,8 +59,9 @@ public class TreeNodeDef implements Serializable { double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf); result = Math.max(result1,result2); } - else + else { result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); + } stats.addFieldStats( fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 0487e5b..8d41a37 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -200,4 +200,32 @@ public class ClusteringFunctionTest extends AbstractPaceTest { } -} + @Test + public void testPersonClustering(){ + + final ClusteringFunction cf = new PersonClustering(params); + final String s = "Abd-Alla, Abo-el-nour N."; + System.out.println("s = " + s); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); + + final String s1 = "Manghi, Paolo"; + System.out.println("s1 = " + s1); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); + + } + + @Test + public void testPersonHash(){ + + final ClusteringFunction cf = new PersonHash(params); + final String s = "Manghi, Paolo"; + System.out.println("s = " + s); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); + + final String s1 = "Manghi, P."; + System.out.println("s = " + s1); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); + + } + +} \ No newline at end of file diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index f4eee93..749802f 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -9,6 +9,7 @@ import eu.dnetlib.pace.config.DedupConfig; import org.junit.jupiter.api.*; import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Map; @@ -246,6 +247,10 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(0.25, result); + Field f = createFieldList(new ArrayList<>(), "authors"); + result = authorsMatch.compare(f,f, conf); + System.out.println("result = " + result); + } @Test diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 879e572..56d8530 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -85,7 +85,7 @@ public class ConfigTest extends AbstractPaceTest { } @Test - public void asMapDocumentTest() { + public void asMapDocumentTest1() { DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); @@ -103,6 +103,19 @@ public class ConfigTest extends AbstractPaceTest { } + @Test + public void asMapDocumentTest2() { + + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.test.conf.json")); + + final String json = readFromClasspath("author.json"); + + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); + + System.out.println("mapDocument = " + mapDocument.getFieldMap().get("coauthors").stringValue()); + + } + @Test public void testJPath() { final String json = readFromClasspath("organization.json"); diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json new file mode 100644 index 0000000..62c6e91 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.json @@ -0,0 +1 @@ +{"id": "f3389e7c8af1d806c06e2ab51f28a4b4", "name": "Aczél, János", "shortname": "Aczél, J.", "pid": "aczel.janos", "coauthors": [], "publication": {"year": "1955", "title": "L\\\"osung der Vektor-Funktionalgleichung der homogenen und inhomogenen $n$-dimensionalen einparametrigen ``Translation'' der erzeugenden Funktion von Kettenreaktionen und des station\\\"aren und nichtstation\\\"aren Bewegungsintegrals", "venue": "Acta Math. Acad. Sci. Hung. 6, 131-140 (1955)."}} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json new file mode 100644 index 0000000..0ac29f8 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/author.test.conf.json @@ -0,0 +1,102 @@ +{ + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "author", + "subEntityType": "author", + "subEntityValue": "author", + "orderField": "fullname", + "queueMaxSize": "200", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "50", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + "resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.id" + }, + "pace": { + "clustering" : [ + { "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} }, + { "name" : "personHash", "fields" : [ "fullname" ], "params" : {} } + ], + "decisionTree": { + "start": { + "fields": [ + { + "field": "year", + "comparator": "numbersComparator", + "weight": 1, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 50, + "aggregation": "MAX", + "positive": "NO_MATCH", + "negative": "surnames", + "undefined": "surnames", + "ignoreUndefined": "true" + }, + "surnames": { + "fields": [ + { + "field": "coauthors", + "comparator": "authorsMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "surname_th": 0.75, + "fullname_th": 0.75, + "size_th": 20, + "mode": "surname" + } + } + ], + "threshold": 0.6, + "aggregation": "MAX", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "fullname", + "type": "String", + "path": "$.name" + }, + { + "name": "coauthors", + "type": "List", + "path": "$.coauthors[*].name", + "size": 200 + }, + { + "name": "year", + "type": "String", + "path": "$.publication.year" + }, + { + "name": "title", + "type": "String", + "path": "$.publication.title" + } + ], + "blacklists": {}, + "synonyms": {} + } +} \ No newline at end of file