forked from D-Net/dnet-hadoop
Implement support for author deduplication: cosine similarity comparator and double-array JSON parser
This commit is contained in:
parent
297eb207a5
commit
cb595c87bb
|
@ -1,5 +1,5 @@
|
||||||
package eu.dnetlib.pace.config;
|
package eu.dnetlib.pace.config;
|
||||||
|
|
||||||
public enum Type {
|
public enum Type {
|
||||||
String, Int, List, JSON, URL, StringConcat
|
String, Int, List, JSON, URL, StringConcat, DoubleArray
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,4 +20,6 @@ public interface FieldValue extends Field {
|
||||||
*/
|
*/
|
||||||
public void setValue(final Object value);
|
public void setValue(final Object value);
|
||||||
|
|
||||||
|
public double[] doubleArrayValue();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -58,8 +58,10 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
|
||||||
throw new RuntimeException(value.toString());
|
throw new RuntimeException(value.toString());
|
||||||
}
|
}
|
||||||
case URL:
|
case URL:
|
||||||
String str = value.toString();
|
String str = value.toString();
|
||||||
return StringUtils.isBlank(str) || !isValidURL(str);
|
return StringUtils.isBlank(str) || !isValidURL(str);
|
||||||
|
case DoubleArray:
|
||||||
|
return doubleArrayValue().length==0;
|
||||||
default:
|
default:
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -116,6 +118,10 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
|
||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public double[] doubleArrayValue() {
|
||||||
|
return (double[])getValue();
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* (non-Javadoc)
|
* (non-Javadoc)
|
||||||
*
|
*
|
||||||
|
|
|
@ -52,7 +52,7 @@ public class AuthorsMatch extends AbstractComparator {
|
||||||
if (a.isEmpty() || b.isEmpty())
|
if (a.isEmpty() || b.isEmpty())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) a).size() > SIZE_THRESHOLD)
|
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
|
||||||
return 1.0;
|
return 1.0;
|
||||||
|
|
||||||
List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
import eu.dnetlib.pace.model.FieldList;
|
||||||
|
import eu.dnetlib.pace.model.FieldValueImpl;
|
||||||
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ComparatorClass("cosineSimilarity")
|
||||||
|
public class CosineSimilarity extends AbstractComparator {
|
||||||
|
|
||||||
|
Map<String, String> params;
|
||||||
|
|
||||||
|
public CosineSimilarity(Map<String,String> params) {
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double compare(final Field a, final Field b, final Config conf) {
|
||||||
|
|
||||||
|
if (a.isEmpty() || b.isEmpty())
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
double[] aVector = ((FieldValueImpl) a).doubleArrayValue();
|
||||||
|
double[] bVector = ((FieldValueImpl) b).doubleArrayValue();
|
||||||
|
|
||||||
|
return cosineSimilarity(aVector, bVector);
|
||||||
|
}
|
||||||
|
|
||||||
|
double cosineSimilarity(double[] a, double[] b) {
|
||||||
|
double dotProduct = 0;
|
||||||
|
double normASum = 0;
|
||||||
|
double normBSum = 0;
|
||||||
|
|
||||||
|
for(int i = 0; i < a.length; i ++) {
|
||||||
|
dotProduct += a[i] * b[i];
|
||||||
|
normASum += a[i] * a[i];
|
||||||
|
normBSum += b[i] * b[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
|
||||||
|
return dotProduct / eucledianDist;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -19,9 +19,13 @@ public class StringListMatch extends AbstractComparator {
|
||||||
private static final Log log = LogFactory.getLog(StringListMatch.class);
|
private static final Log log = LogFactory.getLog(StringListMatch.class);
|
||||||
private Map<String, String> params;
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
final private String TYPE; //percentage or count
|
||||||
|
|
||||||
public StringListMatch(final Map<String, String> params) {
|
public StringListMatch(final Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
this.params = params;
|
this.params = params;
|
||||||
|
|
||||||
|
TYPE = params.getOrDefault("type", "percentage");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -31,7 +35,7 @@ public class StringListMatch extends AbstractComparator {
|
||||||
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
|
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
|
||||||
|
|
||||||
if (pa.isEmpty() || pb.isEmpty()) {
|
if (pa.isEmpty() || pb.isEmpty()) {
|
||||||
return -1; //return undefined if one of the two lists of pids is empty
|
return -1; //return undefined if one of the two lists is empty
|
||||||
}
|
}
|
||||||
|
|
||||||
int incommon = Sets.intersection(pa, pb).size();
|
int incommon = Sets.intersection(pa, pb).size();
|
||||||
|
@ -41,7 +45,10 @@ public class StringListMatch extends AbstractComparator {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (double)incommon / (incommon + simDiff);
|
if(TYPE.equals("percentage"))
|
||||||
|
return (double)incommon / (incommon + simDiff);
|
||||||
|
else
|
||||||
|
return incommon;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -244,6 +244,5 @@ public class BlockProcessorForTesting {
|
||||||
final String type = dedupConf.getWf().getEntityType();
|
final String type = dedupConf.getWf().getEntityType();
|
||||||
|
|
||||||
context.emit(type, from, to);
|
context.emit(type, from, to);
|
||||||
context.emit(type, to, from);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,12 +7,10 @@ import com.jayway.jsonpath.JsonPath;
|
||||||
import com.jayway.jsonpath.Option;
|
import com.jayway.jsonpath.Option;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import eu.dnetlib.pace.config.Type;
|
import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.*;
|
||||||
import eu.dnetlib.pace.model.FieldListImpl;
|
|
||||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
|
||||||
import net.minidev.json.JSONArray;
|
import net.minidev.json.JSONArray;
|
||||||
|
|
||||||
|
import java.math.BigDecimal;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -46,6 +44,14 @@ public class MapDocumentUtil {
|
||||||
.forEach(fi::add);
|
.forEach(fi::add);
|
||||||
stringField.put(fdef.getName(), fi);
|
stringField.put(fdef.getName(), fi);
|
||||||
break;
|
break;
|
||||||
|
case DoubleArray:
|
||||||
|
stringField.put(
|
||||||
|
fdef.getName(),
|
||||||
|
new FieldValueImpl(Type.DoubleArray,
|
||||||
|
fdef.getName(),
|
||||||
|
getJPathArray(fdef.getPath(), json))
|
||||||
|
);
|
||||||
|
break;
|
||||||
case StringConcat:
|
case StringConcat:
|
||||||
String[] jpaths = fdef.getPath().split("\\|\\|\\|");
|
String[] jpaths = fdef.getPath().split("\\|\\|\\|");
|
||||||
stringField.put(
|
stringField.put(
|
||||||
|
@ -115,6 +121,30 @@ public class MapDocumentUtil {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static double[] getJPathArray(final String jsonPath, final String json) {
|
||||||
|
try {
|
||||||
|
Object o = JsonPath.read(json, jsonPath);
|
||||||
|
if (o instanceof double[])
|
||||||
|
return (double[]) o;
|
||||||
|
if (o instanceof JSONArray) {
|
||||||
|
Object[] objects = ((JSONArray) o).toArray();
|
||||||
|
double[] array = new double[objects.length];
|
||||||
|
for (int i = 0; i < objects.length; i++) {
|
||||||
|
if (objects[i] instanceof BigDecimal)
|
||||||
|
array[i] = ((BigDecimal)objects[i]).doubleValue();
|
||||||
|
else
|
||||||
|
array[i] = (double) objects[i];
|
||||||
|
}
|
||||||
|
return array;
|
||||||
|
}
|
||||||
|
return new double[0];
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
return new double[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static String truncateValue(String value, int length) {
|
public static String truncateValue(String value, int length) {
|
||||||
if (value == null)
|
if (value == null)
|
||||||
|
|
|
@ -36,6 +36,10 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
|
||||||
return new FieldValueImpl(Type.URL, "url", s);
|
return new FieldValueImpl(Type.URL, "url", s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected Field array(final double[] a) {
|
||||||
|
return new FieldValueImpl(Type.DoubleArray, "array", a);
|
||||||
|
}
|
||||||
|
|
||||||
protected Field createFieldList(List<String> strings, String fieldName){
|
protected Field createFieldList(List<String> strings, String fieldName){
|
||||||
|
|
||||||
List<FieldValueImpl> fieldValueStream = strings.stream().map(s -> new FieldValueImpl(Type.String, fieldName, s)).collect(Collectors.toList());
|
List<FieldValueImpl> fieldValueStream = strings.stream().map(s -> new FieldValueImpl(Type.String, fieldName, s)).collect(Collectors.toList());
|
||||||
|
|
|
@ -2,7 +2,9 @@ package eu.dnetlib.pace.comparators;
|
||||||
|
|
||||||
import eu.dnetlib.pace.AbstractPaceTest;
|
import eu.dnetlib.pace.AbstractPaceTest;
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
|
import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
import eu.dnetlib.pace.model.FieldValueImpl;
|
||||||
import eu.dnetlib.pace.tree.*;
|
import eu.dnetlib.pace.tree.*;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
|
||||||
|
@ -284,5 +286,18 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void cosineSimilarity() {
|
||||||
|
|
||||||
|
CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
|
||||||
|
|
||||||
|
Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
|
||||||
|
Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
|
||||||
|
|
||||||
|
double compare = cosineSimilarity.compare(a, b, conf);
|
||||||
|
|
||||||
|
System.out.println("compare = " + compare);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,6 +7,7 @@ import eu.dnetlib.pace.clustering.ClusteringClass;
|
||||||
import eu.dnetlib.pace.clustering.ClusteringCombiner;
|
import eu.dnetlib.pace.clustering.ClusteringCombiner;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
import eu.dnetlib.pace.model.FieldList;
|
||||||
|
import eu.dnetlib.pace.model.FieldValue;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.tree.JsonListMatch;
|
import eu.dnetlib.pace.tree.JsonListMatch;
|
||||||
import eu.dnetlib.pace.tree.support.AggType;
|
import eu.dnetlib.pace.tree.support.AggType;
|
||||||
|
@ -20,10 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.*;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
|
||||||
|
@ -104,15 +102,15 @@ public class ConfigTest extends AbstractPaceTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void asMapDocumentTest2() {
|
public void authorAsMapDocument() {
|
||||||
|
|
||||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.test.conf.json"));
|
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.fdup.conf.json"));
|
||||||
|
|
||||||
final String json = readFromClasspath("author.json");
|
final String json = readFromClasspath("author.json");
|
||||||
|
|
||||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||||
|
|
||||||
System.out.println("mapDocument = " + mapDocument.getFieldMap().get("coauthors").stringValue());
|
System.out.println("mapDocument = " + Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue()));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -29,71 +29,103 @@
|
||||||
},
|
},
|
||||||
"pace": {
|
"pace": {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "personClustering", "fields" : [ "fullname" ], "params" : {} },
|
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
|
||||||
{ "name" : "personHash", "fields" : [ "fullname" ], "params" : {} }
|
|
||||||
],
|
],
|
||||||
"decisionTree": {
|
"decisionTree": {
|
||||||
"start": {
|
"start": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "year",
|
"field": "orcid",
|
||||||
"comparator": "numbersComparator",
|
"comparator": "exactMatch",
|
||||||
"weight": 1,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "true",
|
||||||
"params": {}
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 50,
|
"threshold": 1.0,
|
||||||
"aggregation": "MAX",
|
"aggregation": "MAX",
|
||||||
"positive": "NO_MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "surnames",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "surnames",
|
"undefined": "orcids",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"surnames": {
|
"orcids": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "orcids",
|
||||||
|
"comparator": "stringListMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {"type": "count"}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 3.0,
|
||||||
|
"aggregation": "MAX",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "coauthors",
|
||||||
|
"undefined": "coauthors",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"coauthors": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "coauthors",
|
"field": "coauthors",
|
||||||
"comparator": "authorsMatch",
|
"comparator": "authorsMatch",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "true",
|
||||||
"params": {
|
"params": {"type": "count"}
|
||||||
"surname_th": 0.75,
|
|
||||||
"fullname_th": 0.75,
|
|
||||||
"size_th": 20,
|
|
||||||
"mode": "surname"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.6,
|
"threshold": 1.0,
|
||||||
|
"aggregation": "MAX",
|
||||||
|
"positive": "topicsMatch",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "topicsMatch",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"topicsMatch": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "topics",
|
||||||
|
"comparator": "cosineSimilarity",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1.0,
|
||||||
"aggregation": "MAX",
|
"aggregation": "MAX",
|
||||||
"positive": "MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "MATCH",
|
"undefined": "NO_MATCH",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "false"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"model": [
|
"model": [
|
||||||
|
{
|
||||||
|
"name": "topics",
|
||||||
|
"type": "DoubleArray",
|
||||||
|
"path": "$.topics"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "fullname",
|
"name": "fullname",
|
||||||
"type": "String",
|
"type": "String",
|
||||||
"path": "$.name"
|
"path": "$.fullname"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "orcid",
|
||||||
|
"type": "String",
|
||||||
|
"path": "$.orcid"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "coauthors",
|
"name": "coauthors",
|
||||||
"type": "List",
|
"type": "List",
|
||||||
"path": "$.coauthors[*].name",
|
"path": "$.coAuthors[*].fullname"
|
||||||
"size": 200
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "year",
|
"name": "orcids",
|
||||||
"type": "String",
|
"type": "List",
|
||||||
"path": "$.publication.year"
|
"path": "$.coAuthors[*].orcid"
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "title",
|
|
||||||
"type": "String",
|
|
||||||
"path": "$.publication.title"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"blacklists": {},
|
"blacklists": {},
|
|
@ -1 +1 @@
|
||||||
{"id": "f3389e7c8af1d806c06e2ab51f28a4b4", "name": "Aczél, János", "shortname": "Aczél, J.", "pid": "aczel.janos", "coauthors": [], "publication": {"year": "1955", "title": "L\\\"osung der Vektor-Funktionalgleichung der homogenen und inhomogenen $n$-dimensionalen einparametrigen ``Translation'' der erzeugenden Funktion von Kettenreaktionen und des station\\\"aren und nichtstation\\\"aren Bewegungsintegrals", "venue": "Acta Math. Acad. Sci. Hung. 6, 131-140 (1955)."}}
|
{"fullname":"Zaragoza, Maria Cleofé","firstname":"Maria Cleofé","lastname":"Zaragoza","coAuthors":[{"fullname":"Cambras, Trinitat","lastname":"Cambras","firstname":"Trinitat","orcid":"0000-0002-9009-4690"},{"fullname":"Castro-Marrero, Jesús","lastname":"Castro-Marrero","firstname":"Jesús","orcid":""},{"fullname":"Díez-Noguera, Antoni","lastname":"Díez-Noguera","firstname":"Antoni","orcid":""},{"fullname":"Alegre, José","lastname":"Alegre","firstname":"José","orcid":"0000-0002-7582-7585"}],"topics":[0.9522090839562252,0.04779091604377485],"orcid":"0000-0002-9797-0219","id":"author::1a10826c83c7f9f0dcebe7df05e37a2a","pubId":"50|pmid________::db7fd19db5a620eafad40cfb97f9690d"}
|
Loading…
Reference in New Issue