implementation of comparators and clustering function for the author deduplication

This commit is contained in:
miconis 2022-04-19 10:18:09 +02:00
parent 9618e889bd
commit 6c47fb0e67
11 changed files with 305 additions and 12 deletions

View File

@ -1,6 +1,12 @@
#entitiesPath = /tmp/publications_test_dump
entitiesPath = /user/michele.debonis/raw_graph_for_testing/publication
workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
numPartitions = 8000
useTree = false
#entitiesPath = /user/michele.debonis/raw_graph_for_testing/publication
#workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
#dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
#numPartitions = 8000
#useTree = false
useTree = true
numPartitions = 1
dedupConfPath = /user/michele.debonis/authors_dedup_test/auth.tree.conf.json
workingPath = /user/michele.debonis/authors_dedup_test/workingdir
entitiesPath = /user/michele.debonis/authors_dedup_test/authors-scad-zbmath-1.json

View File

@ -23,7 +23,6 @@ import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
@ -31,8 +30,6 @@ import org.mockito.junit.jupiter.MockitoExtension;
import scala.Tuple2;
import java.awt.*;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
@ -184,7 +181,9 @@ public class DedupLocalTest extends DedupTestUtils {
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json").toURI()).toFile().getAbsolutePath()
));
String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()).toFile().getAbsolutePath();
String simRelsPath = workingPath + "/simrels";
String mergeRelsPath = workingPath + "/mergerels";
String outputPath = workingPath + "/dedup";

View File

@ -0,0 +1,102 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} },
{ "name" : "personHash", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "year",
"comparator": "numbersComparator",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 50,
"aggregation": "MAX",
"positive": "NO_MATCH",
"negative": "surnames",
"undefined": "surnames",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"size_th": 20,
"mode": "surname"
}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "fullname",
"type": "String",
"path": "$.name"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coauthors[*].name",
"size": 200
},
{
"name": "year",
"type": "String",
"path": "$.publication.year"
},
{
"name": "title",
"type": "String",
"path": "$.publication.title"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@ -9,7 +9,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("personhash")
@ClusteringClass("personHash")
public class PersonHash extends AbstractClusteringFunction {
private boolean DEFAULT_AGGRESSIVE = false;

View File

@ -0,0 +1,34 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator registered under the name {@code numbersComparator}.
 *
 * <p>It extracts the numeric content of two field values and returns the absolute
 * difference between the two numbers. In the author dedup configuration it is applied
 * to the publication year.
 */
@ComparatorClass("numbersComparator")
public class NumbersComparator extends AbstractComparator {

    /** Raw comparator parameters, kept as received from the configuration. */
    Map<String, String> params;

    /**
     * Builds the comparator.
     *
     * @param params comparator parameters; forwarded to the superclass and stored as-is
     */
    public NumbersComparator(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * Computes the absolute difference between the numbers found in the two values.
     *
     * @param a    first field value
     * @param b    second field value
     * @param conf dedup configuration (not used by this comparator)
     * @return {@code |n1 - n2|}, or {@code -1.0} when either value yields no number or a
     *         number that cannot be parsed
     */
    @Override
    public double distance(String a, String b, Config conf) {

        // extracts numbers from the field
        // NOTE(review): getNumbers/nfd are not defined in this class - presumably inherited helpers; confirm.
        String numbers1 = getNumbers(nfd(a));
        String numbers2 = getNumbers(nfd(b));

        // nothing to compare: same sentinel the comparator has always returned for this case
        if (numbers1.isEmpty() || numbers2.isEmpty())
            return -1.0;

        try {
            // long instead of int: a longer digit sequence no longer aborts the comparison
            final long n1 = Long.parseLong(numbers1);
            final long n2 = Long.parseLong(numbers2);

            // subtract as doubles so the difference itself cannot overflow
            return Math.abs((double) n1 - (double) n2);
        } catch (NumberFormatException e) {
            // digits that do not fit even in a long (or are otherwise malformed) are treated
            // like the "no number" case instead of propagating an unchecked exception
            return -1.0;
        }
    }
}

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
@ -9,6 +10,7 @@ import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.Serializable;
import java.io.StringWriter;
import java.util.List;
public class TreeNodeDef implements Serializable {
@ -57,8 +59,9 @@ public class TreeNodeDef implements Serializable {
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
result = Math.max(result1,result2);
}
else
else {
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
}
stats.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),

View File

@ -200,4 +200,32 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
}
}
/**
 * Prints the clustering keys that {@code PersonClustering} produces for two sample
 * person names. The test only logs the output and makes no assertions.
 */
@Test
public void testPersonClustering(){

    // function under test, built from the test's params
    final ClusteringFunction personClustering = new PersonClustering(params);

    // first sample: hyphenated surname and given name with a trailing initial
    final String hyphenatedName = "Abd-Alla, Abo-el-nour N.";
    System.out.println("s = " + hyphenatedName);
    System.out.println(personClustering.apply(conf, Lists.newArrayList(title(hyphenatedName))));

    // second sample: plain "surname, name" form
    final String plainName = "Manghi, Paolo";
    System.out.println("s1 = " + plainName);
    System.out.println(personClustering.apply(conf, Lists.newArrayList(title(plainName))));
}
/**
 * Prints the clustering keys that {@code PersonHash} produces for the full and the
 * abbreviated form of the same person name, so the two outputs can be compared by eye.
 * The test only logs the output and makes no assertions.
 */
@Test
public void testPersonHash(){

    final ClusteringFunction cf = new PersonHash(params);

    // full given name
    final String s = "Manghi, Paolo";
    System.out.println("s = " + s);
    System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));

    // same person with the given name reduced to an initial
    final String s1 = "Manghi, P.";
    // label fixed: it previously printed "s = " for s1, making the two outputs indistinguishable
    System.out.println("s1 = " + s1);
    System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
}
}

View File

@ -9,6 +9,7 @@ import eu.dnetlib.pace.config.DedupConfig;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
@ -246,6 +247,10 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.25, result);
Field f = createFieldList(new ArrayList<>(), "authors");
result = authorsMatch.compare(f,f, conf);
System.out.println("result = " + result);
}
@Test

View File

@ -85,7 +85,7 @@ public class ConfigTest extends AbstractPaceTest {
}
@Test
public void asMapDocumentTest() {
public void asMapDocumentTest1() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
@ -103,6 +103,19 @@ public class ConfigTest extends AbstractPaceTest {
}
/**
 * Loads the author test configuration and the sample author record from the classpath,
 * maps the record to a {@code MapDocument} and prints the string value of its
 * {@code coauthors} field. The test only logs the output and makes no assertions.
 */
@Test
public void asMapDocumentTest2() {

    // author-specific dedup configuration
    DedupConfig authorConf = DedupConfig.load(readFromClasspath("author.test.conf.json"));

    // sample author record
    final String authorJson = readFromClasspath("author.json");

    // map the raw JSON onto the model declared in the configuration
    final MapDocument authorDocument = MapDocumentUtil.asMapDocumentWithJPath(authorConf, authorJson);

    System.out.println("mapDocument = " + authorDocument.getFieldMap().get("coauthors").stringValue());
}
@Test
public void testJPath() {
final String json = readFromClasspath("organization.json");

View File

@ -0,0 +1 @@
{"id": "f3389e7c8af1d806c06e2ab51f28a4b4", "name": "Aczél, János", "shortname": "Aczél, J.", "pid": "aczel.janos", "coauthors": [], "publication": {"year": "1955", "title": "L\\\"osung der Vektor-Funktionalgleichung der homogenen und inhomogenen $n$-dimensionalen einparametrigen ``Translation'' der erzeugenden Funktion von Kettenreaktionen und des station\\\"aren und nichtstation\\\"aren Bewegungsintegrals", "venue": "Acta Math. Acad. Sci. Hung. 6, 131-140 (1955)."}}

View File

@ -0,0 +1,102 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} },
{ "name" : "personHash", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "year",
"comparator": "numbersComparator",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 50,
"aggregation": "MAX",
"positive": "NO_MATCH",
"negative": "surnames",
"undefined": "surnames",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"size_th": 20,
"mode": "surname"
}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "fullname",
"type": "String",
"path": "$.name"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coauthors[*].name",
"size": 200
},
{
"name": "year",
"type": "String",
"path": "$.publication.year"
},
{
"name": "title",
"type": "String",
"path": "$.publication.title"
}
],
"blacklists": {},
"synonyms": {}
}
}