Implementation of the decision tree for the deduplication of authors, implementation of multiple comparators to be used in a tree node, and definition of the proto for the person entity

This commit is contained in:
Michele De Bonis 2018-12-20 09:54:41 +01:00
parent 0bd20c565a
commit 9ff83d6567
20 changed files with 268 additions and 119 deletions

View File

@ -1,2 +1,2 @@
{ "type": 30, "id": "30|author::id1", "person": { "metadata":{"orcid": "orcid1", "fullname": "smith, john", "firstname": "john", "lastname": "smith", "pubID": "pubid1", "pubDOI": "pubdoi1", "coauthors": ["la bruzzo, sandro", "atzori, claudio", "baglioni, miriam", "bardi, alessia"], "topics": [0.0,0.0,0.0], "rank":1, "area":"1"}}}
{ "type": 30, "id": "30|author::id2", "person": { "metadata":{"orcid": "orcid2", "fullname": "smith, john", "firstname": "john", "lastname": "smith", "pubID": "pubid2", "pubDOI": "pubdoi2", "coauthors": ["la bruzzo, sandro", "atzori, claudio", "baglioni, miriam", "bardi, alessia"], "topics": [0.0,0.0,0.0], "rank":3, "area":"1"}}}
{ "type": 30, "id": "30|author::id2", "person": { "metadata":{"orcid": "", "fullname": "smith, john", "firstname": "john", "lastname": "smith", "pubID": "pubid2", "pubDOI": "pubdoi2", "coauthors": ["la bruzzo, sandro", "atzori, claudio", "baglioni, miriam", "bardi, alessia"], "topics": [0.0,0.0,0.0], "rank":3, "area":"1"}}}

View File

@ -16,12 +16,12 @@
],
"conditions": [],
"decisionTree": {
"start": {"name": "exactMatch", "field": "pubID", "positive": "NO_MATCH", "negative": "layer2", "undefined": "layer2", "params": {}},
"layer2": {"name": "exactMatch", "field": "orcid", "positive": "ORCID_MATCH", "negative": "NO_MATCH", "undefined": "layer3", "params": {}},
"layer3": {"name": "similar", "field": "firstname", "positive": "layer4", "negative": "NO_MATCH", "undefined": "layer4", "params": { "th": 0.7}},
"layer4": {"name": "coauthorsMatch", "field": "coauthors", "positive": "COAUTHORS_MATCH", "negative": "NO_MATCH", "undefined": "layer5", "params": {"th": 5, "minCoauthors": 6, "maxCoauthors": 200}},
"layer5": {"name" : "exactMatch", "field": "area", "positive": "layer6", "negative": "NO_MATCH", "undefined": "NO_MATCH", "params": {}},
"layer6": {"name": "topicsMatch", "field": "topics", "positive": "TOPICS_MATCH", "negative": "NO_MATCH", "undefined": "NO_MATCH", "params": {"th": 0.7}}
"start": {"fields": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"},
"layer2": {"fields": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"},
"layer3": {"fields": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"},
"layer4": {"fields": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"},
"layer5": {"fields": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"},
"layer6": {"fields": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}
},
"model": [
{"name": "fullname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/fullname"},
@ -32,7 +32,8 @@
{"name": "topics", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/topics"},
{"name": "pubID", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/pubID"},
{"name": "pubDOI", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/pubDOI"},
{"name": "rank", "algo": "Null", "type": "Int", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/rank"}
{"name": "rank", "algo": "Null", "type": "Int", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/rank"},
{"name": "area", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/area"}
],
"blacklists": {}
}

View File

@ -193,11 +193,38 @@ public abstract class AbstractPaceFunctions {
}
/**
 * Replaces the city names found in {@code s1} with the values they are mapped to.
 * <p>
 * A city name is replaced only where it appears surrounded by a single space on
 * both sides, so callers that want the first/last token to be considered must pad
 * the input with spaces themselves.
 *
 * @param s1      the string to normalize
 * @param cityMap maps a city name to the value that replaces it
 * @return the string with every space-delimited occurrence of a mapped city name replaced
 */
public String normalizeCities(String s1, Map<String,String> cityMap){
    //TODO change normalization mode
    for (Map.Entry<String, String> city : cityMap.entrySet()) {
        // Literal replacement: city names are plain text, not regular expressions, so
        // characters such as '.', '(' or '$' must not be interpreted as regex syntax.
        s1 = s1.replace(" " + city.getKey() + " ", " " + city.getValue() + " ");
    }
    return s1;
}
/**
 * Replaces a city name found in {@code s1} with the value it is mapped to, looking
 * for the longest match first.
 * <p>
 * The string is split on single spaces and every run of consecutive tokens, from
 * {@code windowSize} tokens long down to a single token, is looked up in
 * {@code cityMap}. The search stops at the first candidate found: all of its
 * space-delimited occurrences are replaced and the result is returned.
 * <p>
 * NOTE(review): when a replacement happens the returned string keeps the extra
 * leading and trailing space added for matching, while an unmatched input is
 * returned untouched; callers that need a trimmed value must trim it themselves.
 *
 * @param s1         the string to normalize
 * @param cityMap    maps a city name (one or more space-separated tokens) to its replacement
 * @param windowSize maximum number of consecutive tokens forming a candidate city name;
 *                   values larger than the number of tokens are capped, values lower
 *                   than 1 leave the string unchanged
 * @return the normalized string, or {@code s1} itself when no city name is found
 */
public String normalizeCities2 (String s1, Map<String, String> cityMap, int windowSize){
    final List<String> tokens = Arrays.asList(s1.split(" "));

    // Longest candidates first, so that e.g. "new york" wins over "york".
    for (int length = Math.min(windowSize, tokens.size()); length > 0; length--) {
        for (int i = 0; i + length <= tokens.size(); i++) {
            final String candidate = String.join(" ", tokens.subList(i, i + length));
            if (cityMap.containsKey(candidate)) {
                // Literal replacement: the candidate is plain text, not a regular expression.
                return (" " + s1 + " ").replace(" " + candidate + " ", " " + cityMap.get(candidate) + " ");
            }
        }
    }
    return s1;
}
public String removeCodes(String s) {
final String regexKey = "\\bkey::[0-9]*\\b";
final String regexCity = "\\bcity::[0-9]*\\b";

View File

@ -47,9 +47,14 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
ca = translate(ca, translationMap);
cb = translate(cb, translationMap);
String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
ca = norm.split("\\|\\|\\|")[0].trim();
cb = norm.split("\\|\\|\\|")[1].trim();
//replace cities with codes
// String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
// ca = norm.split("\\|\\|\\|")[0].trim();
// cb = norm.split("\\|\\|\\|")[1].trim();
ca = normalizeCities2(ca, cityMap, 4);
cb = normalizeCities2(cb, cityMap, 4);
if (sameCity(ca,cb)){
if (sameKeywords(ca,cb)){

View File

@ -0,0 +1,67 @@
package eu.dnetlib.pace.model;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
/**
 * Configuration of one field comparison: which field to compare, which comparator
 * to use, the weight of the result and the comparator parameters.
 */
public class FieldConf implements Serializable {

    /** Name of the field on which the comparator is applied. */
    private String field;

    /** Name of the comparator. */
    private String comparator;

    /** Weight of the field, to be used in the aggregation. */
    private double weight = 1.0;

    /** Parameters of the comparator. */
    private Map<String,Number> params;

    /** Creates an empty configuration with the default weight of 1.0. */
    public FieldConf() {
    }

    /**
     * Creates a fully specified configuration.
     *
     * @param field      name of the field on which the comparator is applied
     * @param comparator name of the comparator
     * @param weight     weight of the field
     * @param params     parameters of the comparator
     */
    public FieldConf(String field, String comparator, double weight, Map<String, Number> params) {
        this.field = field;
        this.comparator = comparator;
        this.weight = weight;
        this.params = params;
    }

    /** @return the name of the field on which the comparator is applied */
    public String getField() {
        return this.field;
    }

    /** @param field the name of the field on which the comparator is applied */
    public void setField(String field) {
        this.field = field;
    }

    /** @return the name of the comparator */
    public String getComparator() {
        return this.comparator;
    }

    /** @param comparator the name of the comparator */
    public void setComparator(String comparator) {
        this.comparator = comparator;
    }

    /** @return the weight of the field */
    public double getWeight() {
        return this.weight;
    }

    /** @param weight the weight of the field */
    public void setWeight(double weight) {
        this.weight = weight;
    }

    /** @return the parameters of the comparator */
    public Map<String, Number> getParams() {
        return this.params;
    }

    /** @param params the parameters of the comparator */
    public void setParams(Map<String, Number> params) {
        this.params = params;
    }

    /**
     * Serializes this configuration as a JSON string.
     *
     * @throws PaceException if the serialization fails
     */
    @Override
    public String toString() {
        final ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Impossible to convert to JSON: ", e);
        }
    }
}

View File

@ -1,60 +1,113 @@
package eu.dnetlib.pace.model;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.tree.TreeNode;
import eu.dnetlib.pace.tree.Comparator;
import eu.dnetlib.pace.tree.support.AggType;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.List;
public class TreeNodeDef implements Serializable {
private String name;
private String field;
private List<FieldConf> fields; //list of fields involved in the tree node (contains comparators to be used and field on which apply the comparator)
private AggType aggregation; //how to aggregate similarity measures for every field
private String positive;
private String negative;
private String undefined;
private double threshold; //threshold on the similarity measure
private Map<String, Number> params;
private String positive; //specifies the next node in case of positive result: similarity>=th
private String negative; //specifies the next node in case of negative result: similarity<th
private String undefined; //specifies the next node in case of undefined result: similarity=-1
boolean ignoreMissing = true; //specifies what to do in case of missing field
public TreeNodeDef() {
}
public TreeNodeDef(String name, String field, String positive, String negative, String undefined, Map<String, Number> params) {
this.name = name;
this.field = field;
/**
 * Computes the similarity measure between two documents for this node.
 * <p>
 * Every configured field is compared with its own comparator; each defined result
 * is multiplied by the field weight and the weighted results are aggregated
 * according to {@code aggregation}. A comparator result of -1 means "undefined":
 * it is left out of the aggregation when {@code ignoreMissing} is true, otherwise
 * the whole evaluation is undefined.
 *
 * @param doc1 the first document
 * @param doc2 the second document
 * @return the aggregated similarity; -1 if a comparator gave an undefined result and
 *         missing values are not ignored; 0.0 if no field produced a defined result
 */
public double evaluate(MapDocument doc1, MapDocument doc2) {

    final DescriptiveStatistics stats = new DescriptiveStatistics();

    for (FieldConf fieldConf : fields) {

        final String fieldName = fieldConf.getField();
        final double similarity = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldName), doc2.getFieldMap().get(fieldName));

        //a similarity of -1 means that the comparator gave undefined
        if (similarity == -1) {
            if (!ignoreMissing) //the missing value must not be ignored: the node is undefined
                return -1;
            continue; //the missing value is ignored: do not add it to the stats
        }

        stats.addValue(fieldConf.getWeight() * similarity);
    }

    //nothing to aggregate: mean, max and min of an empty set are NaN, so return
    //the same 0.0 that SUM gives instead of letting NaN leak to the caller
    if (stats.getN() == 0)
        return 0.0;

    switch (aggregation){
        case AVG:
            return stats.getMean();
        case SUM:
            return stats.getSum();
        case MAX:
            return stats.getMax();
        case MIN:
            return stats.getMin();
        default:
            return 0.0;
    }
}
/**
 * Resolves the comparator configured for the given field.
 *
 * @param field the field configuration holding the comparator name and its parameters
 * @return the comparator obtained from the pace resolver
 */
private Comparator comparator(final FieldConf field){
    final String comparatorName = field.getComparator();
    final Map<String, Number> comparatorParams = field.getParams();
    return PaceConfig.paceResolver.getComparator(comparatorName, comparatorParams);
}
public TreeNodeDef(List<FieldConf> fields, double threshold, AggType aggregation, String positive, String negative, String undefined) {
this.fields = fields;
this.threshold = threshold;
this.aggregation = aggregation;
this.positive = positive;
this.negative = negative;
this.undefined = undefined;
this.params = params;
}
public TreeNode treeNode() {
try {
return PaceConfig.paceResolver.getTreeNode(getName(), params);
} catch (PaceException e) {
e.printStackTrace();
return null;
}
/** @return the flag that specifies what to do in case of missing field */
public boolean isIgnoreMissing() {
    return this.ignoreMissing;
}
public String getName() {
return name;
/** @param ignoreMissing the flag that specifies what to do in case of missing field */
public void setIgnoreMissing(final boolean ignoreMissing) {
    this.ignoreMissing = ignoreMissing;
}
public void setName(String name) {
this.name = name;
/** @return the list of fields involved in the tree node */
public List<FieldConf> getFields() {
    return this.fields;
}
public String getField() {
return field;
/** @param fields the list of fields involved in the tree node */
public void setFields(final List<FieldConf> fields) {
    this.fields = fields;
}
public void setField(String field) {
this.field = field;
/** @return the threshold on the similarity measure */
public double getThreshold() {
    return this.threshold;
}
/** @param threshold the threshold on the similarity measure */
public void setThreshold(final double threshold) {
    this.threshold = threshold;
}
/** @return how the similarity measures of the fields are aggregated */
public AggType getAggregation() {
    return this.aggregation;
}
/** @param aggregation how the similarity measures of the fields are aggregated */
public void setAggregation(final AggType aggregation) {
    this.aggregation = aggregation;
}
public String getPositive() {
@ -81,20 +134,12 @@ public class TreeNodeDef implements Serializable {
this.undefined = undefined;
}
public Map<String, Number> getParams() {
return params;
}
public void setParams(Map<String, Number> params) {
this.params = params;
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
return e.getStackTrace().toString();
throw new PaceException("Impossible to convert to JSON: ", e);
}
}
}

View File

@ -5,17 +5,17 @@ import org.apache.commons.lang.StringUtils;
import java.util.Map;
public class AbstractTreeNode implements TreeNode {
abstract class AbstractComparator implements Comparator {
Map<String, Number> params;
public AbstractTreeNode(Map<String, Number> params){
public AbstractComparator(Map<String, Number> params){
this.params = params;
}
@Override
public int compare(Field a, Field b) {
return 0;
public double compare(Field a, Field b) {
return 0.0;
}
public static double stringSimilarity(String s1, String s2) {

View File

@ -6,15 +6,15 @@ import eu.dnetlib.pace.model.FieldList;
import java.util.List;
import java.util.Map;
@TreeNodeClass("coauthorsMatch")
public class CoauthorsMatch extends AbstractTreeNode {
@ComparatorClass("coauthorsMatch")
public class CoauthorsMatch extends AbstractComparator {
public CoauthorsMatch(Map<String, Number> params) {
super(params);
}
@Override
public int compare(Field a, Field b) {
public double compare(Field a, Field b) {
final List<String> c1 = ((FieldList) a).stringList();
final List<String> c2 = ((FieldList) b).stringList();
@ -24,7 +24,7 @@ public class CoauthorsMatch extends AbstractTreeNode {
//few coauthors or too many coauthors
if (size1 < params.getOrDefault("minCoauthors", 5).intValue() || size2 < params.getOrDefault("minCoauthors", 5).intValue() || (size1+size2 > params.getOrDefault("maxCoauthors", 200).intValue()))
return 0;
return -1;
int coauthorship = 0;
for (String ca1: c1){
@ -36,11 +36,7 @@ public class CoauthorsMatch extends AbstractTreeNode {
}
}
if (coauthorship>=params.getOrDefault("th", 5).intValue())
return 1;
else if (coauthorship == 0)
return -1;
else
return 0;
return coauthorship;
}
}

View File

@ -0,0 +1,10 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.model.Field;
/**
 * A comparison between two fields that yields a numeric measure.
 */
public interface Comparator {

    /**
     * Compares two fields.
     *
     * @param a the first field
     * @param b the second field
     * @return the distance measure, or -1 if it is undefined
     */
    double compare(Field a, Field b);
}

View File

@ -7,7 +7,7 @@ import java.lang.annotation.Target;
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface TreeNodeClass {
public @interface ComparatorClass {
public String value();
}

View File

@ -4,22 +4,22 @@ import eu.dnetlib.pace.model.Field;
import java.util.Map;
@TreeNodeClass("exactMatch")
public class ExactMatch extends AbstractTreeNode {
@ComparatorClass("exactMatch")
public class ExactMatch extends AbstractComparator {
public ExactMatch(Map<String, Number> params) {
super(params);
}
@Override
public int compare(Field a, Field b) {
public double compare(Field a, Field b) {
if (a.stringValue().isEmpty() || b.stringValue().isEmpty())
return 0;
return -1;
else if (a.stringValue().equals(b.stringValue()))
return 1;
else
return -1;
return 0;
}
}

View File

@ -4,18 +4,18 @@ import eu.dnetlib.pace.model.Field;
import java.util.Map;
@TreeNodeClass("similar")
public class SimilarMatch extends AbstractTreeNode {
@ComparatorClass("similar")
public class SimilarMatch extends AbstractComparator {
public SimilarMatch(Map<String, Number> params) {
super(params);
}
@Override
public int compare(Field a, Field b) {
public double compare(Field a, Field b) {
if (a.stringValue().isEmpty() || b.stringValue().isEmpty())
return 0; //undefined if one name is missing
return -1; //undefined if one name is missing
//take only the first name
String firstname1 = a.stringValue().split(" ")[0];
@ -24,12 +24,7 @@ public class SimilarMatch extends AbstractTreeNode {
if (firstname1.toLowerCase().trim().replaceAll("\\.","").replaceAll("\\s","").length()<=2 || firstname2.toLowerCase().replaceAll("\\.", "").replaceAll("\\s","").length()<=2) //too short names (considered similar)
return 1;
if (stringSimilarity(firstname1,firstname2)>params.getOrDefault("th", 0.7).doubleValue()){
return 1; //similar names, go on with the analysis
}
else {
return -1; //names too different, no need to compare
}
return stringSimilarity(firstname1,firstname2);
}

View File

@ -5,21 +5,21 @@ import eu.dnetlib.pace.model.FieldListImpl;
import java.util.Map;
@TreeNodeClass("topicsMatch")
public class TopicsMatch extends AbstractTreeNode {
@ComparatorClass("topicsMatch")
public class TopicsMatch extends AbstractComparator {
public TopicsMatch(Map<String, Number> params) {
super(params);
}
@Override
public int compare(Field a, Field b) {
public double compare(Field a, Field b) {
double[] t1 = ((FieldListImpl) a).doubleArray();
double[] t2 = ((FieldListImpl) b).doubleArray();
if (t1 == null || t2 == null)
return 0; //0 similarity if no topics in one of the authors or in both
return -1; //0 similarity if no topics in one of the authors or in both
double area = 0.0;
@ -30,7 +30,7 @@ public class TopicsMatch extends AbstractTreeNode {
area += min_value[i];
}
return area>params.getOrDefault("th", 0.7).doubleValue()?+1:-1;
return area;
}
}

View File

@ -1,10 +0,0 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.model.Field;
/**
 * A comparison between two fields that yields a three-valued outcome.
 */
public interface TreeNode {

    /**
     * Compares two fields.
     *
     * @param a the first field
     * @param b the second field
     * @return +1 if the fields match, 0 if the result is undefined, -1 if they do not match
     */
    int compare(Field a, Field b);
}

View File

@ -6,13 +6,13 @@ import eu.dnetlib.pace.model.FieldList;
import java.util.List;
import java.util.Map;
@TreeNodeClass("undefined")
public class UndefinedNode implements TreeNode {
@ComparatorClass("undefined")
public class UndefinedNode implements Comparator {
Map<String, Number> params;
@Override
public int compare(Field a, Field b) {
public double compare(Field a, Field b) {
final List<String> sa = ((FieldList) a).stringList();
final List<String> sb = ((FieldList) b).stringList();

View File

@ -0,0 +1,9 @@
package eu.dnetlib.pace.tree.support;
/**
 * Aggregation types: average, sum, maximum and minimum.
 */
public enum AggType {
    AVG, SUM, MAX, MIN
}

View File

@ -1,13 +1,12 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
public enum MatchType {
ORCID_MATCH,
COAUTHORS_MATCH,
TOPICS_MATCH,
NO_MATCH;
NO_MATCH,
UNDEFINED;
public static MatchType getEnum(String value) {
@ -15,7 +14,7 @@ public enum MatchType {
return MatchType.valueOf(value);
}
catch (IllegalArgumentException e) {
throw new PaceException("The match type is not valid");
return MatchType.UNDEFINED;
}
}
}

View File

@ -70,36 +70,40 @@ public class BlockProcessor {
final String idCurr = curr.getIdentifier();
//check if pivot and current element are similar by processing the tree
if (navigateTree(pivot, curr))
if (navigateTree(pivot, curr)!=MatchType.NO_MATCH)
writeSimilarity(context, idPivot, idCurr);
}
}
}
private boolean navigateTree(final MapDocument doc1, final MapDocument doc2){
private MatchType navigateTree(final MapDocument doc1, final MapDocument doc2){
final Map<String, TreeNodeDef> decisionTree = dedupConf.getPace().getDecisionTree();
String current = "start";
while (!current.equals(MatchType.NO_MATCH.toString()) && !current.equals(MatchType.ORCID_MATCH.toString()) && !current.equals(MatchType.TOPICS_MATCH.toString()) && !current.equals(MatchType.COAUTHORS_MATCH.toString())) {
while (MatchType.getEnum(current)==MatchType.UNDEFINED) {
TreeNodeDef currentNode = decisionTree.get(current);
//throw an exception if the node doesn't exist
if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + current);
int compare = currentNode.treeNode().compare(doc1.getFieldMap().get(currentNode.getField()), doc2.getFieldMap().get(currentNode.getField()));
double similarity = currentNode.evaluate(doc1, doc2);
if (similarity == -1) {
current = currentNode.getUndefined();
}
else if (similarity>=currentNode.getThreshold()){
current = currentNode.getPositive();
}
else {
current = currentNode.getNegative();
}
current = (compare==0)?currentNode.getUndefined():(compare==-1)?currentNode.getNegative():currentNode.getPositive();
}
if (!current.equals(MatchType.NO_MATCH.toString()))
return true;
else
return false;
return MatchType.getEnum(current);
}
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {

View File

@ -7,8 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.TreeNode;
import eu.dnetlib.pace.tree.TreeNodeClass;
import eu.dnetlib.pace.tree.Comparator;
import eu.dnetlib.pace.tree.ComparatorClass;
import org.reflections.Reflections;
import java.io.Serializable;
@ -22,7 +22,7 @@ public class PaceResolver implements Serializable {
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
private final Map<String, Class<ConditionAlgo>> conditionAlgos;
private final Map<String, Class<DistanceAlgo>> distanceAlgos;
private final Map<String, Class<TreeNode>> treeNodes;
private final Map<String, Class<Comparator>> comparators;
public PaceResolver() {
@ -38,9 +38,9 @@ public class PaceResolver implements Serializable {
.filter(DistanceAlgo.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
this.treeNodes = new Reflections("eu.dnetlib").getTypesAnnotatedWith(TreeNodeClass.class).stream()
.filter(TreeNode.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(TreeNodeClass.class).value(), cl -> (Class<TreeNode>) cl));
this.comparators = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ComparatorClass.class).stream()
.filter(Comparator.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>) cl));
}
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
@ -67,9 +67,9 @@ public class PaceResolver implements Serializable {
}
}
public TreeNode getTreeNode(String name, Map<String, Number> params) throws PaceException {
public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
try {
return treeNodes.get(name).getDeclaredConstructor(Map.class).newInstance(params);
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
throw new PaceException(name + " not found ", e);
}

View File

@ -56,9 +56,10 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
public void testJaroWinklerNormalizedName2() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("University of Pisa", "Universita degli studi di Pisa");
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York");
assertEquals(result, 1.0);
}
}