implementation of the decision tree for the deduplication of the authors, implementation of multiple comparators to be used in a tree node and definition of the proto for person entity

This commit is contained in:
Michele De Bonis 2018-12-20 09:54:41 +01:00
parent 39613dbbd6
commit 7a8d28991f
18 changed files with 259 additions and 111 deletions

View File

@ -193,11 +193,38 @@ public abstract class AbstractPaceFunctions {
} }
public String normalizeCities(String s1, Map<String,String> cityMap){ public String normalizeCities(String s1, Map<String,String> cityMap){
//TODO change normalization mode
for (String city : cityMap.keySet()) for (String city : cityMap.keySet())
s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " "); s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " ");
return s1; return s1;
} }
/**
 * Replaces the longest known city name found in {@code s1} with its mapped value.
 *
 * <p>The input is split on single spaces and every run of consecutive tokens is
 * looked up in {@code cityMap}, starting with runs of {@code windowSize} tokens and
 * shrinking down to single tokens. As soon as a run is a key of the map, every
 * space-delimited occurrence of it is replaced and the method returns; shorter or
 * later candidates are not considered.
 *
 * <p>Note on the return value: when a replacement happens the result carries one
 * extra leading and one extra trailing space (the padding used to match whole
 * words); when nothing matches, {@code s1} is returned untouched.
 *
 * @param s1         the string to normalize
 * @param cityMap    maps a city name (possibly multi-word) to its replacement
 * @param windowSize maximum number of consecutive tokens tried as a city name;
 *                   values larger than the token count are capped, values
 *                   {@code <= 0} leave the string unchanged
 * @return the padded, normalized string on a match, otherwise {@code s1}
 */
public String normalizeCities2 (String s1, Map<String, String> cityMap, int windowSize){
    final List<String> tokens = Arrays.asList(s1.split(" "));

    // a window can never be wider than the number of available tokens
    final int widest = Math.min(windowSize, tokens.size());

    // longest candidates first, so "new york" wins over "york";
    // "> 0" (rather than "!= 0") keeps a negative window from reaching subList
    for (int length = widest; length > 0; length--) {
        for (int start = 0; start + length <= tokens.size(); start++) {
            final String candidate = String.join(" ", tokens.subList(start, start + length));
            if (cityMap.containsKey(candidate)) {
                // literal replace: city names and codes must not be read as regex
                // syntax ('.', '(' ...) or replacement-group references ('$', '\')
                return (" " + s1 + " ").replace(" " + candidate + " ", " " + cityMap.get(candidate) + " ");
            }
        }
    }
    return s1;
}
public String removeCodes(String s) { public String removeCodes(String s) {
final String regexKey = "\\bkey::[0-9]*\\b"; final String regexKey = "\\bkey::[0-9]*\\b";
final String regexCity = "\\bcity::[0-9]*\\b"; final String regexCity = "\\bcity::[0-9]*\\b";

View File

@ -47,9 +47,14 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
ca = translate(ca, translationMap); ca = translate(ca, translationMap);
cb = translate(cb, translationMap); cb = translate(cb, translationMap);
String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap); //replace cities with codes
ca = norm.split("\\|\\|\\|")[0].trim(); // String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
cb = norm.split("\\|\\|\\|")[1].trim(); // ca = norm.split("\\|\\|\\|")[0].trim();
// cb = norm.split("\\|\\|\\|")[1].trim();
ca = normalizeCities2(ca, cityMap, 4);
cb = normalizeCities2(cb, cityMap, 4);
if (sameCity(ca,cb)){ if (sameCity(ca,cb)){
if (sameKeywords(ca,cb)){ if (sameKeywords(ca,cb)){

View File

@ -0,0 +1,67 @@
package eu.dnetlib.pace.model;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
/**
 * Configuration of one field comparison: which field to read, which comparator
 * to apply to it, the weight of its result and the comparator parameters.
 *
 * <p>Plain bean with a no-arg constructor and getters/setters; {@link #toString()}
 * renders the instance as JSON.
 */
public class FieldConf implements Serializable {

    /**
     * Shared JSON writer for {@link #toString()}, created once instead of on every
     * call. Static, so it is not part of the serialized state of an instance.
     */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Name of the field the comparator is applied to. */
    private String field;

    /** Name of the comparator. */
    private String comparator;

    /** Weight for the field, to be used in the aggregation; defaults to 1.0. */
    private double weight = 1.0;

    /** Parameters handed to the comparator. */
    private Map<String,Number> params;

    /** Creates an empty configuration with the default weight. */
    public FieldConf() {
    }

    /**
     * Creates a fully populated configuration.
     *
     * @param field      name of the field to compare
     * @param comparator name of the comparator
     * @param weight     weight of the field in the aggregation
     * @param params     comparator parameters
     */
    public FieldConf(String field, String comparator, double weight, Map<String, Number> params) {
        this.field = field;
        this.comparator = comparator;
        this.weight = weight;
        this.params = params;
    }

    /** @return name of the field to compare */
    public String getField() {
        return field;
    }

    /** @param field name of the field to compare */
    public void setField(String field) {
        this.field = field;
    }

    /** @return name of the comparator */
    public String getComparator() {
        return comparator;
    }

    /** @param comparator name of the comparator */
    public void setComparator(String comparator) {
        this.comparator = comparator;
    }

    /** @return weight of the field in the aggregation */
    public double getWeight() {
        return weight;
    }

    /** @param weight weight of the field in the aggregation */
    public void setWeight(double weight) {
        this.weight = weight;
    }

    /** @return comparator parameters; {@code null} if none were set */
    public Map<String, Number> getParams() {
        return params;
    }

    /** @param params comparator parameters */
    public void setParams(Map<String, Number> params) {
        this.params = params;
    }

    /**
     * Renders this configuration as a JSON string.
     *
     * @throws PaceException if the JSON serialization fails
     */
    @Override
    public String toString() {
        try {
            return MAPPER.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Impossible to convert to JSON: ", e);
        }
    }
}

View File

@ -1,60 +1,113 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.tree.TreeNode; import eu.dnetlib.pace.tree.Comparator;
import eu.dnetlib.pace.tree.support.AggType;
import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.Map; import java.util.List;
public class TreeNodeDef implements Serializable { public class TreeNodeDef implements Serializable {
private String name; private List<FieldConf> fields; //list of fields involved in the tree node (contains comparators to be used and field on which apply the comparator)
private String field; private AggType aggregation; //how to aggregate similarity measures for every field
private String positive; private double threshold; //threshold on the similarity measure
private String negative;
private String undefined;
private Map<String, Number> params; private String positive; //specifies the next node in case of positive result: similarity>=th
private String negative; //specifies the next node in case of negative result: similarity<th
private String undefined; //specifies the next node in case of undefined result: similarity=-1
boolean ignoreMissing = true; //specifies what to do in case of missing field
public TreeNodeDef() { public TreeNodeDef() {
} }
public TreeNodeDef(String name, String field, String positive, String negative, String undefined, Map<String, Number> params) { //compute the similarity measure between two documents
this.name = name; public double evaluate(MapDocument doc1, MapDocument doc2) {
this.field = field;
DescriptiveStatistics stats = new DescriptiveStatistics();
for (FieldConf fieldConf : fields) {
double weight = fieldConf.getWeight();
double similarity = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
//if similarity is -1 means that a comparator gave undefined, do not add result to the stats
if (similarity != -1) {
stats.addValue(weight * similarity);
}
else {
if (!ignoreMissing) //if the missing value has not to be ignored, return -1
return -1;
}
}
switch (aggregation){
case AVG:
return stats.getMean();
case SUM:
return stats.getSum();
case MAX:
return stats.getMax();
case MIN:
return stats.getMin();
default:
return 0.0;
}
}
private Comparator comparator(final FieldConf field){
return PaceConfig.paceResolver.getComparator(field.getComparator(), field.getParams());
}
public TreeNodeDef(List<FieldConf> fields, double threshold, AggType aggregation, String positive, String negative, String undefined) {
this.fields = fields;
this.threshold = threshold;
this.aggregation = aggregation;
this.positive = positive; this.positive = positive;
this.negative = negative; this.negative = negative;
this.undefined = undefined; this.undefined = undefined;
this.params = params;
} }
public TreeNode treeNode() { public boolean isIgnoreMissing() {
try { return ignoreMissing;
return PaceConfig.paceResolver.getTreeNode(getName(), params);
} catch (PaceException e) {
e.printStackTrace();
return null;
}
} }
public String getName() { public void setIgnoreMissing(boolean ignoreMissing) {
return name; this.ignoreMissing = ignoreMissing;
} }
public void setName(String name) { public List<FieldConf> getFields() {
this.name = name; return fields;
} }
public String getField() { public void setFields(List<FieldConf> fields) {
return field; this.fields = fields;
} }
public void setField(String field) { public double getThreshold() {
this.field = field; return threshold;
}
public void setThreshold(double threshold) {
this.threshold = threshold;
}
public AggType getAggregation() {
return aggregation;
}
public void setAggregation(AggType aggregation) {
this.aggregation = aggregation;
} }
public String getPositive() { public String getPositive() {
@ -81,20 +134,12 @@ public class TreeNodeDef implements Serializable {
this.undefined = undefined; this.undefined = undefined;
} }
public Map<String, Number> getParams() {
return params;
}
public void setParams(Map<String, Number> params) {
this.params = params;
}
@Override @Override
public String toString() { public String toString() {
try { try {
return new ObjectMapper().writeValueAsString(this); return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) { } catch (IOException e) {
return e.getStackTrace().toString(); throw new PaceException("Impossible to convert to JSON: ", e);
} }
} }
} }

View File

@ -5,17 +5,17 @@ import org.apache.commons.lang.StringUtils;
import java.util.Map; import java.util.Map;
public class AbstractTreeNode implements TreeNode { abstract class AbstractComparator implements Comparator {
Map<String, Number> params; Map<String, Number> params;
public AbstractTreeNode(Map<String, Number> params){ public AbstractComparator(Map<String, Number> params){
this.params = params; this.params = params;
} }
@Override @Override
public int compare(Field a, Field b) { public double compare(Field a, Field b) {
return 0; return 0.0;
} }
public static double stringSimilarity(String s1, String s2) { public static double stringSimilarity(String s1, String s2) {

View File

@ -6,15 +6,15 @@ import eu.dnetlib.pace.model.FieldList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@TreeNodeClass("coauthorsMatch") @ComparatorClass("coauthorsMatch")
public class CoauthorsMatch extends AbstractTreeNode { public class CoauthorsMatch extends AbstractComparator {
public CoauthorsMatch(Map<String, Number> params) { public CoauthorsMatch(Map<String, Number> params) {
super(params); super(params);
} }
@Override @Override
public int compare(Field a, Field b) { public double compare(Field a, Field b) {
final List<String> c1 = ((FieldList) a).stringList(); final List<String> c1 = ((FieldList) a).stringList();
final List<String> c2 = ((FieldList) b).stringList(); final List<String> c2 = ((FieldList) b).stringList();
@ -24,7 +24,7 @@ public class CoauthorsMatch extends AbstractTreeNode {
//few coauthors or too many coauthors //few coauthors or too many coauthors
if (size1 < params.getOrDefault("minCoauthors", 5).intValue() || size2 < params.getOrDefault("minCoauthors", 5).intValue() || (size1+size2 > params.getOrDefault("maxCoauthors", 200).intValue())) if (size1 < params.getOrDefault("minCoauthors", 5).intValue() || size2 < params.getOrDefault("minCoauthors", 5).intValue() || (size1+size2 > params.getOrDefault("maxCoauthors", 200).intValue()))
return 0; return -1;
int coauthorship = 0; int coauthorship = 0;
for (String ca1: c1){ for (String ca1: c1){
@ -36,11 +36,7 @@ public class CoauthorsMatch extends AbstractTreeNode {
} }
} }
if (coauthorship>=params.getOrDefault("th", 5).intValue()) return coauthorship;
return 1;
else if (coauthorship == 0)
return -1;
else
return 0;
} }
} }

View File

@ -0,0 +1,10 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.model.Field;
/**
 * A comparison between two fields that yields a numeric measure.
 */
public interface Comparator {

    /**
     * Compares two fields.
     *
     * @param a the first field
     * @param b the second field
     * @return the distance measure computed for the two fields, or -1 if the
     *         comparison is undefined
     */
    double compare(Field a, Field b);
}

View File

@ -7,7 +7,7 @@ import java.lang.annotation.Target;
@Retention(RetentionPolicy.RUNTIME) @Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE) @Target(ElementType.TYPE)
public @interface TreeNodeClass { public @interface ComparatorClass {
public String value(); public String value();
} }

View File

@ -4,22 +4,22 @@ import eu.dnetlib.pace.model.Field;
import java.util.Map; import java.util.Map;
@TreeNodeClass("exactMatch") @ComparatorClass("exactMatch")
public class ExactMatch extends AbstractTreeNode { public class ExactMatch extends AbstractComparator {
public ExactMatch(Map<String, Number> params) { public ExactMatch(Map<String, Number> params) {
super(params); super(params);
} }
@Override @Override
public int compare(Field a, Field b) { public double compare(Field a, Field b) {
if (a.stringValue().isEmpty() || b.stringValue().isEmpty()) if (a.stringValue().isEmpty() || b.stringValue().isEmpty())
return 0; return -1;
else if (a.stringValue().equals(b.stringValue())) else if (a.stringValue().equals(b.stringValue()))
return 1; return 1;
else else
return -1; return 0;
} }
} }

View File

@ -4,18 +4,18 @@ import eu.dnetlib.pace.model.Field;
import java.util.Map; import java.util.Map;
@TreeNodeClass("similar") @ComparatorClass("similar")
public class SimilarMatch extends AbstractTreeNode { public class SimilarMatch extends AbstractComparator {
public SimilarMatch(Map<String, Number> params) { public SimilarMatch(Map<String, Number> params) {
super(params); super(params);
} }
@Override @Override
public int compare(Field a, Field b) { public double compare(Field a, Field b) {
if (a.stringValue().isEmpty() || b.stringValue().isEmpty()) if (a.stringValue().isEmpty() || b.stringValue().isEmpty())
return 0; //undefined if one name is missing return -1; //undefined if one name is missing
//take only the first name //take only the first name
String firstname1 = a.stringValue().split(" ")[0]; String firstname1 = a.stringValue().split(" ")[0];
@ -24,12 +24,7 @@ public class SimilarMatch extends AbstractTreeNode {
if (firstname1.toLowerCase().trim().replaceAll("\\.","").replaceAll("\\s","").length()<=2 || firstname2.toLowerCase().replaceAll("\\.", "").replaceAll("\\s","").length()<=2) //too short names (considered similar) if (firstname1.toLowerCase().trim().replaceAll("\\.","").replaceAll("\\s","").length()<=2 || firstname2.toLowerCase().replaceAll("\\.", "").replaceAll("\\s","").length()<=2) //too short names (considered similar)
return 1; return 1;
if (stringSimilarity(firstname1,firstname2)>params.getOrDefault("th", 0.7).doubleValue()){ return stringSimilarity(firstname1,firstname2);
return 1; //similar names, go on with the analysis
}
else {
return -1; //names too different, no need to compare
}
} }

View File

@ -5,21 +5,21 @@ import eu.dnetlib.pace.model.FieldListImpl;
import java.util.Map; import java.util.Map;
@TreeNodeClass("topicsMatch") @ComparatorClass("topicsMatch")
public class TopicsMatch extends AbstractTreeNode { public class TopicsMatch extends AbstractComparator {
public TopicsMatch(Map<String, Number> params) { public TopicsMatch(Map<String, Number> params) {
super(params); super(params);
} }
@Override @Override
public int compare(Field a, Field b) { public double compare(Field a, Field b) {
double[] t1 = ((FieldListImpl) a).doubleArray(); double[] t1 = ((FieldListImpl) a).doubleArray();
double[] t2 = ((FieldListImpl) b).doubleArray(); double[] t2 = ((FieldListImpl) b).doubleArray();
if (t1 == null || t2 == null) if (t1 == null || t2 == null)
return 0; //0 similarity if no topics in one of the authors or in both return -1; //0 similarity if no topics in one of the authors or in both
double area = 0.0; double area = 0.0;
@ -30,7 +30,7 @@ public class TopicsMatch extends AbstractTreeNode {
area += min_value[i]; area += min_value[i];
} }
return area>params.getOrDefault("th", 0.7).doubleValue()?+1:-1; return area;
} }
} }

View File

@ -1,10 +0,0 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.model.Field;
public interface TreeNode {
//compare two fields and returns: +1 if match, 0 if undefined, -1 if do not match
public int compare(Field a, Field b);
}

View File

@ -6,13 +6,13 @@ import eu.dnetlib.pace.model.FieldList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@TreeNodeClass("undefined") @ComparatorClass("undefined")
public class UndefinedNode implements TreeNode { public class UndefinedNode implements Comparator {
Map<String, Number> params; Map<String, Number> params;
@Override @Override
public int compare(Field a, Field b) { public double compare(Field a, Field b) {
final List<String> sa = ((FieldList) a).stringList(); final List<String> sa = ((FieldList) a).stringList();
final List<String> sb = ((FieldList) b).stringList(); final List<String> sb = ((FieldList) b).stringList();

View File

@ -0,0 +1,9 @@
package eu.dnetlib.pace.tree.support;
/**
 * Ways of aggregating the per-field similarity measures of a tree node into a
 * single value.
 */
public enum AggType {

    /** Mean of the collected values. */
    AVG,

    /** Sum of the collected values. */
    SUM,

    /** Largest of the collected values. */
    MAX,

    /** Smallest of the collected values. */
    MIN
}

View File

@ -1,13 +1,12 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
public enum MatchType { public enum MatchType {
ORCID_MATCH, ORCID_MATCH,
COAUTHORS_MATCH, COAUTHORS_MATCH,
TOPICS_MATCH, TOPICS_MATCH,
NO_MATCH; NO_MATCH,
UNDEFINED;
public static MatchType getEnum(String value) { public static MatchType getEnum(String value) {
@ -15,7 +14,7 @@ public enum MatchType {
return MatchType.valueOf(value); return MatchType.valueOf(value);
} }
catch (IllegalArgumentException e) { catch (IllegalArgumentException e) {
throw new PaceException("The match type is not valid"); return MatchType.UNDEFINED;
} }
} }
} }

View File

@ -70,36 +70,40 @@ public class BlockProcessor {
final String idCurr = curr.getIdentifier(); final String idCurr = curr.getIdentifier();
//check if pivot and current element are similar by processing the tree //check if pivot and current element are similar by processing the tree
if (navigateTree(pivot, curr)) if (navigateTree(pivot, curr)!=MatchType.NO_MATCH)
writeSimilarity(context, idPivot, idCurr); writeSimilarity(context, idPivot, idCurr);
} }
} }
} }
private boolean navigateTree(final MapDocument doc1, final MapDocument doc2){ private MatchType navigateTree(final MapDocument doc1, final MapDocument doc2){
final Map<String, TreeNodeDef> decisionTree = dedupConf.getPace().getDecisionTree(); final Map<String, TreeNodeDef> decisionTree = dedupConf.getPace().getDecisionTree();
String current = "start"; String current = "start";
while (!current.equals(MatchType.NO_MATCH.toString()) && !current.equals(MatchType.ORCID_MATCH.toString()) && !current.equals(MatchType.TOPICS_MATCH.toString()) && !current.equals(MatchType.COAUTHORS_MATCH.toString())) { while (MatchType.getEnum(current)==MatchType.UNDEFINED) {
TreeNodeDef currentNode = decisionTree.get(current); TreeNodeDef currentNode = decisionTree.get(current);
//throw an exception if the node doesn't exist //throw an exception if the node doesn't exist
if (currentNode == null) if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + current); throw new PaceException("The Tree Node doesn't exist: " + current);
int compare = currentNode.treeNode().compare(doc1.getFieldMap().get(currentNode.getField()), doc2.getFieldMap().get(currentNode.getField())); double similarity = currentNode.evaluate(doc1, doc2);
current = (compare==0)?currentNode.getUndefined():(compare==-1)?currentNode.getNegative():currentNode.getPositive(); if (similarity == -1) {
current = currentNode.getUndefined();
}
else if (similarity>=currentNode.getThreshold()){
current = currentNode.getPositive();
}
else {
current = currentNode.getNegative();
} }
if (!current.equals(MatchType.NO_MATCH.toString())) }
return true;
else return MatchType.getEnum(current);
return false;
} }
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) { private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {

View File

@ -7,8 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass;
import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.TreeNode; import eu.dnetlib.pace.tree.Comparator;
import eu.dnetlib.pace.tree.TreeNodeClass; import eu.dnetlib.pace.tree.ComparatorClass;
import org.reflections.Reflections; import org.reflections.Reflections;
import java.io.Serializable; import java.io.Serializable;
@ -22,7 +22,7 @@ public class PaceResolver implements Serializable {
private final Map<String, Class<ClusteringFunction>> clusteringFunctions; private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
private final Map<String, Class<ConditionAlgo>> conditionAlgos; private final Map<String, Class<ConditionAlgo>> conditionAlgos;
private final Map<String, Class<DistanceAlgo>> distanceAlgos; private final Map<String, Class<DistanceAlgo>> distanceAlgos;
private final Map<String, Class<TreeNode>> treeNodes; private final Map<String, Class<Comparator>> comparators;
public PaceResolver() { public PaceResolver() {
@ -38,9 +38,9 @@ public class PaceResolver implements Serializable {
.filter(DistanceAlgo.class::isAssignableFrom) .filter(DistanceAlgo.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl)); .collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
this.treeNodes = new Reflections("eu.dnetlib").getTypesAnnotatedWith(TreeNodeClass.class).stream() this.comparators = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ComparatorClass.class).stream()
.filter(TreeNode.class::isAssignableFrom) .filter(Comparator.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(TreeNodeClass.class).value(), cl -> (Class<TreeNode>) cl)); .collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>) cl));
} }
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException { public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
@ -67,9 +67,9 @@ public class PaceResolver implements Serializable {
} }
} }
public TreeNode getTreeNode(String name, Map<String, Number> params) throws PaceException { public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
try { try {
return treeNodes.get(name).getDeclaredConstructor(Map.class).newInstance(params); return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
throw new PaceException(name + " not found ", e); throw new PaceException(name + " not found ", e);
} }

View File

@ -56,9 +56,10 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
public void testJaroWinklerNormalizedName2() { public void testJaroWinklerNormalizedName2() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("University of Pisa", "Universita degli studi di Pisa"); double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York");
assertEquals(result, 1.0); assertEquals(result, 1.0);
} }
} }