implementation of comparators and clustering function for the author deduplication
This commit is contained in:
parent
9618e889bd
commit
6c47fb0e67
|
@ -1,6 +1,12 @@
|
|||
#entitiesPath = /tmp/publications_test_dump
|
||||
entitiesPath = /user/michele.debonis/raw_graph_for_testing/publication
|
||||
workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
|
||||
dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
|
||||
numPartitions = 8000
|
||||
useTree = false
|
||||
#entitiesPath = /user/michele.debonis/raw_graph_for_testing/publication
|
||||
#workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
|
||||
#dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
|
||||
#numPartitions = 8000
|
||||
#useTree = false
|
||||
|
||||
useTree = true
|
||||
numPartitions = 1
|
||||
dedupConfPath = /user/michele.debonis/authors_dedup_test/auth.tree.conf.json
|
||||
workingPath = /user/michele.debonis/authors_dedup_test/workingdir
|
||||
entitiesPath = /user/michele.debonis/authors_dedup_test/authors-scad-zbmath-1.json
|
|
@ -23,7 +23,6 @@ import org.apache.spark.api.java.function.ForeachFunction;
|
|||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
|
@ -31,8 +30,6 @@ import org.mockito.junit.jupiter.MockitoExtension;
|
|||
import scala.Tuple2;
|
||||
|
||||
import java.awt.*;
|
||||
import java.awt.event.WindowAdapter;
|
||||
import java.awt.event.WindowEvent;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
@ -184,7 +181,9 @@ public class DedupLocalTest extends DedupTestUtils {
|
|||
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
|
||||
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json").toURI()).toFile().getAbsolutePath()
|
||||
));
|
||||
|
||||
String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()).toFile().getAbsolutePath();
|
||||
|
||||
String simRelsPath = workingPath + "/simrels";
|
||||
String mergeRelsPath = workingPath + "/mergerels";
|
||||
String outputPath = workingPath + "/dedup";
|
||||
|
|
|
@ -0,0 +1,102 @@
|
|||
{
|
||||
"wf": {
|
||||
"threshold": "0.99",
|
||||
"dedupRun": "001",
|
||||
"entityType": "author",
|
||||
"subEntityType": "author",
|
||||
"subEntityValue": "author",
|
||||
"orderField": "fullname",
|
||||
"queueMaxSize": "200",
|
||||
"groupMaxSize": "100",
|
||||
"maxChildren": "100",
|
||||
"slidingWindowSize": "50",
|
||||
"rootBuilder": [
|
||||
"result",
|
||||
"resultProject_outcome_isProducedBy",
|
||||
"resultResult_publicationDataset_isRelatedTo",
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments",
|
||||
"resultResult_similarity_hasAmongTopNSimilarDocuments",
|
||||
"resultOrganization_affiliation_isAffiliatedWith",
|
||||
"resultResult_part_hasPart",
|
||||
"resultResult_part_isPartOf",
|
||||
"resultResult_supplement_isSupplementTo",
|
||||
"resultResult_supplement_isSupplementedBy",
|
||||
"resultResult_version_isVersionOf"
|
||||
],
|
||||
"includeChildren": "true",
|
||||
"maxIterations": 20,
|
||||
"idPath": "$.id"
|
||||
},
|
||||
"pace": {
|
||||
"clustering" : [
|
||||
{ "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} },
|
||||
{ "name" : "personHash", "fields" : [ "fullname" ], "params" : {} }
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "year",
|
||||
"comparator": "numbersComparator",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 50,
|
||||
"aggregation": "MAX",
|
||||
"positive": "NO_MATCH",
|
||||
"negative": "surnames",
|
||||
"undefined": "surnames",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"surnames": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "coauthors",
|
||||
"comparator": "authorsMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"surname_th": 0.75,
|
||||
"fullname_th": 0.75,
|
||||
"size_th": 20,
|
||||
"mode": "surname"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.6,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
}
|
||||
},
|
||||
"model": [
|
||||
{
|
||||
"name": "fullname",
|
||||
"type": "String",
|
||||
"path": "$.name"
|
||||
},
|
||||
{
|
||||
"name": "coauthors",
|
||||
"type": "List",
|
||||
"path": "$.coauthors[*].name",
|
||||
"size": 200
|
||||
},
|
||||
{
|
||||
"name": "year",
|
||||
"type": "String",
|
||||
"path": "$.publication.year"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"type": "String",
|
||||
"path": "$.publication.title"
|
||||
}
|
||||
],
|
||||
"blacklists": {},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
|
@ -9,7 +9,7 @@ import com.google.common.collect.Lists;
|
|||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
|
||||
@ClusteringClass("personhash")
|
||||
@ClusteringClass("personHash")
|
||||
public class PersonHash extends AbstractClusteringFunction {
|
||||
|
||||
private boolean DEFAULT_AGGRESSIVE = false;
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("numbersComparator")
|
||||
public class NumbersComparator extends AbstractComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
|
||||
public NumbersComparator(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
|
||||
//extracts numbers from the field
|
||||
String numbers1 = getNumbers(nfd(a));
|
||||
String numbers2 = getNumbers(nfd(b));
|
||||
|
||||
if (numbers1.isEmpty() || numbers2.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
int n1 = Integer.parseInt(numbers1);
|
||||
int n2 = Integer.parseInt(numbers2);
|
||||
|
||||
return Math.abs(n1 - n2);
|
||||
}
|
||||
}
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
|
@ -9,6 +10,7 @@ import org.apache.commons.lang3.StringUtils;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.io.StringWriter;
|
||||
import java.util.List;
|
||||
|
||||
public class TreeNodeDef implements Serializable {
|
||||
|
@ -57,8 +59,9 @@ public class TreeNodeDef implements Serializable {
|
|||
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
result = Math.max(result1,result2);
|
||||
}
|
||||
else
|
||||
else {
|
||||
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
}
|
||||
|
||||
stats.addFieldStats(
|
||||
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||
|
|
|
@ -200,4 +200,32 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPersonClustering(){
|
||||
|
||||
final ClusteringFunction cf = new PersonClustering(params);
|
||||
final String s = "Abd-Alla, Abo-el-nour N.";
|
||||
System.out.println("s = " + s);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
||||
|
||||
final String s1 = "Manghi, Paolo";
|
||||
System.out.println("s1 = " + s1);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPersonHash(){
|
||||
|
||||
final ClusteringFunction cf = new PersonHash(params);
|
||||
final String s = "Manghi, Paolo";
|
||||
System.out.println("s = " + s);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
||||
|
||||
final String s1 = "Manghi, P.";
|
||||
System.out.println("s = " + s1);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -9,6 +9,7 @@ import eu.dnetlib.pace.config.DedupConfig;
|
|||
import org.junit.jupiter.api.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
@ -246,6 +247,10 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
assertEquals(0.25, result);
|
||||
|
||||
Field f = createFieldList(new ArrayList<>(), "authors");
|
||||
result = authorsMatch.compare(f,f, conf);
|
||||
System.out.println("result = " + result);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -85,7 +85,7 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void asMapDocumentTest() {
|
||||
public void asMapDocumentTest1() {
|
||||
|
||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
|
||||
|
||||
|
@ -103,6 +103,19 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void asMapDocumentTest2() {
|
||||
|
||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.test.conf.json"));
|
||||
|
||||
final String json = readFromClasspath("author.json");
|
||||
|
||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||
|
||||
System.out.println("mapDocument = " + mapDocument.getFieldMap().get("coauthors").stringValue());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJPath() {
|
||||
final String json = readFromClasspath("organization.json");
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
{"id": "f3389e7c8af1d806c06e2ab51f28a4b4", "name": "Aczél, János", "shortname": "Aczél, J.", "pid": "aczel.janos", "coauthors": [], "publication": {"year": "1955", "title": "L\\\"osung der Vektor-Funktionalgleichung der homogenen und inhomogenen $n$-dimensionalen einparametrigen ``Translation'' der erzeugenden Funktion von Kettenreaktionen und des station\\\"aren und nichtstation\\\"aren Bewegungsintegrals", "venue": "Acta Math. Acad. Sci. Hung. 6, 131-140 (1955)."}}
|
|
@ -0,0 +1,102 @@
|
|||
{
|
||||
"wf": {
|
||||
"threshold": "0.99",
|
||||
"dedupRun": "001",
|
||||
"entityType": "author",
|
||||
"subEntityType": "author",
|
||||
"subEntityValue": "author",
|
||||
"orderField": "fullname",
|
||||
"queueMaxSize": "200",
|
||||
"groupMaxSize": "100",
|
||||
"maxChildren": "100",
|
||||
"slidingWindowSize": "50",
|
||||
"rootBuilder": [
|
||||
"result",
|
||||
"resultProject_outcome_isProducedBy",
|
||||
"resultResult_publicationDataset_isRelatedTo",
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments",
|
||||
"resultResult_similarity_hasAmongTopNSimilarDocuments",
|
||||
"resultOrganization_affiliation_isAffiliatedWith",
|
||||
"resultResult_part_hasPart",
|
||||
"resultResult_part_isPartOf",
|
||||
"resultResult_supplement_isSupplementTo",
|
||||
"resultResult_supplement_isSupplementedBy",
|
||||
"resultResult_version_isVersionOf"
|
||||
],
|
||||
"includeChildren": "true",
|
||||
"maxIterations": 20,
|
||||
"idPath": "$.id"
|
||||
},
|
||||
"pace": {
|
||||
"clustering" : [
|
||||
{ "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} },
|
||||
{ "name" : "personHash", "fields" : [ "fullname" ], "params" : {} }
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "year",
|
||||
"comparator": "numbersComparator",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 50,
|
||||
"aggregation": "MAX",
|
||||
"positive": "NO_MATCH",
|
||||
"negative": "surnames",
|
||||
"undefined": "surnames",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"surnames": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "coauthors",
|
||||
"comparator": "authorsMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"surname_th": 0.75,
|
||||
"fullname_th": 0.75,
|
||||
"size_th": 20,
|
||||
"mode": "surname"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.6,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
}
|
||||
},
|
||||
"model": [
|
||||
{
|
||||
"name": "fullname",
|
||||
"type": "String",
|
||||
"path": "$.name"
|
||||
},
|
||||
{
|
||||
"name": "coauthors",
|
||||
"type": "List",
|
||||
"path": "$.coauthors[*].name",
|
||||
"size": 200
|
||||
},
|
||||
{
|
||||
"name": "year",
|
||||
"type": "String",
|
||||
"path": "$.publication.year"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"type": "String",
|
||||
"path": "$.publication.title"
|
||||
}
|
||||
],
|
||||
"blacklists": {},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue