forked from D-Net/dnet-hadoop
implementation of the decision tree for the deduplication of the authors, implementation of multiple comparators to be used in a tree node and definition of the proto for person entity
This commit is contained in:
parent
39613dbbd6
commit
7a8d28991f
|
@ -193,11 +193,38 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String normalizeCities(String s1, Map<String,String> cityMap){
|
public String normalizeCities(String s1, Map<String,String> cityMap){
|
||||||
|
//TODO change normalization mode
|
||||||
|
|
||||||
for (String city : cityMap.keySet())
|
for (String city : cityMap.keySet())
|
||||||
s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " ");
|
s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " ");
|
||||||
return s1;
|
return s1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String normalizeCities2 (String s1, Map<String, String> cityMap, int windowSize){
|
||||||
|
|
||||||
|
List<String> tokens = Arrays.asList(s1.split(" "));
|
||||||
|
|
||||||
|
if (tokens.size()<windowSize)
|
||||||
|
windowSize = tokens.size();
|
||||||
|
|
||||||
|
int length = windowSize;
|
||||||
|
|
||||||
|
while (length != 0) {
|
||||||
|
|
||||||
|
for (int i = 0; i<=tokens.size()-length; i++){
|
||||||
|
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
|
||||||
|
if (cityMap.containsKey(candidate)) {
|
||||||
|
s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + cityMap.get(candidate) + " ");
|
||||||
|
return s1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
length-=1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return s1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public String removeCodes(String s) {
|
public String removeCodes(String s) {
|
||||||
final String regexKey = "\\bkey::[0-9]*\\b";
|
final String regexKey = "\\bkey::[0-9]*\\b";
|
||||||
final String regexCity = "\\bcity::[0-9]*\\b";
|
final String regexCity = "\\bcity::[0-9]*\\b";
|
||||||
|
|
|
@ -47,9 +47,14 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||||
ca = translate(ca, translationMap);
|
ca = translate(ca, translationMap);
|
||||||
cb = translate(cb, translationMap);
|
cb = translate(cb, translationMap);
|
||||||
|
|
||||||
String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
|
//replace cities with codes
|
||||||
ca = norm.split("\\|\\|\\|")[0].trim();
|
// String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap);
|
||||||
cb = norm.split("\\|\\|\\|")[1].trim();
|
// ca = norm.split("\\|\\|\\|")[0].trim();
|
||||||
|
// cb = norm.split("\\|\\|\\|")[1].trim();
|
||||||
|
|
||||||
|
ca = normalizeCities2(ca, cityMap, 4);
|
||||||
|
cb = normalizeCities2(cb, cityMap, 4);
|
||||||
|
|
||||||
|
|
||||||
if (sameCity(ca,cb)){
|
if (sameCity(ca,cb)){
|
||||||
if (sameKeywords(ca,cb)){
|
if (sameKeywords(ca,cb)){
|
||||||
|
|
|
@ -0,0 +1,67 @@
|
||||||
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class FieldConf implements Serializable {
|
||||||
|
|
||||||
|
private String field; //name of the field on which apply the comparator
|
||||||
|
private String comparator; //comparator name
|
||||||
|
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
||||||
|
private Map<String,Number> params; //parameters
|
||||||
|
|
||||||
|
public FieldConf() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public FieldConf(String field, String comparator, double weight, Map<String, Number> params) {
|
||||||
|
this.field = field;
|
||||||
|
this.comparator = comparator;
|
||||||
|
this.weight = weight;
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getField() {
|
||||||
|
return field;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setField(String field) {
|
||||||
|
this.field = field;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getComparator() {
|
||||||
|
return comparator;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setComparator(String comparator) {
|
||||||
|
this.comparator = comparator;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getWeight() {
|
||||||
|
return weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setWeight(double weight) {
|
||||||
|
this.weight = weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, Number> getParams() {
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setParams(Map<String, Number> params) {
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
try {
|
||||||
|
return new ObjectMapper().writeValueAsString(this);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,60 +1,113 @@
|
||||||
package eu.dnetlib.pace.model;
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.PaceConfig;
|
import eu.dnetlib.pace.config.PaceConfig;
|
||||||
import eu.dnetlib.pace.tree.TreeNode;
|
import eu.dnetlib.pace.tree.Comparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.AggType;
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
|
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Map;
|
import java.util.List;
|
||||||
|
|
||||||
public class TreeNodeDef implements Serializable {
|
public class TreeNodeDef implements Serializable {
|
||||||
|
|
||||||
private String name;
|
private List<FieldConf> fields; //list of fields involved in the tree node (contains comparators to be used and field on which apply the comparator)
|
||||||
private String field;
|
private AggType aggregation; //how to aggregate similarity measures for every field
|
||||||
|
|
||||||
private String positive;
|
private double threshold; //threshold on the similarity measure
|
||||||
private String negative;
|
|
||||||
private String undefined;
|
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private String positive; //specifies the next node in case of positive result: similarity>=th
|
||||||
|
private String negative; //specifies the next node in case of negative result: similarity<th
|
||||||
|
private String undefined; //specifies the next node in case of undefined result: similarity=-1
|
||||||
|
|
||||||
|
boolean ignoreMissing = true; //specifies what to do in case of missing field
|
||||||
|
|
||||||
public TreeNodeDef() {
|
public TreeNodeDef() {
|
||||||
}
|
}
|
||||||
|
|
||||||
public TreeNodeDef(String name, String field, String positive, String negative, String undefined, Map<String, Number> params) {
|
//compute the similarity measure between two documents
|
||||||
this.name = name;
|
public double evaluate(MapDocument doc1, MapDocument doc2) {
|
||||||
this.field = field;
|
|
||||||
|
DescriptiveStatistics stats = new DescriptiveStatistics();
|
||||||
|
|
||||||
|
for (FieldConf fieldConf : fields) {
|
||||||
|
|
||||||
|
double weight = fieldConf.getWeight();
|
||||||
|
|
||||||
|
double similarity = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
|
||||||
|
|
||||||
|
//a similarity of -1 means that the comparator returned undefined: do not add the result to the stats
|
||||||
|
if (similarity != -1) {
|
||||||
|
stats.addValue(weight * similarity);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (!ignoreMissing) //if missing values must not be ignored, return -1 (undefined)
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (aggregation){
|
||||||
|
|
||||||
|
case AVG:
|
||||||
|
return stats.getMean();
|
||||||
|
case SUM:
|
||||||
|
return stats.getSum();
|
||||||
|
case MAX:
|
||||||
|
return stats.getMax();
|
||||||
|
case MIN:
|
||||||
|
return stats.getMin();
|
||||||
|
default:
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private Comparator comparator(final FieldConf field){
|
||||||
|
|
||||||
|
return PaceConfig.paceResolver.getComparator(field.getComparator(), field.getParams());
|
||||||
|
}
|
||||||
|
|
||||||
|
public TreeNodeDef(List<FieldConf> fields, double threshold, AggType aggregation, String positive, String negative, String undefined) {
|
||||||
|
this.fields = fields;
|
||||||
|
this.threshold = threshold;
|
||||||
|
this.aggregation = aggregation;
|
||||||
this.positive = positive;
|
this.positive = positive;
|
||||||
this.negative = negative;
|
this.negative = negative;
|
||||||
this.undefined = undefined;
|
this.undefined = undefined;
|
||||||
this.params = params;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public TreeNode treeNode() {
|
public boolean isIgnoreMissing() {
|
||||||
try {
|
return ignoreMissing;
|
||||||
return PaceConfig.paceResolver.getTreeNode(getName(), params);
|
|
||||||
} catch (PaceException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getName() {
|
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||||
return name;
|
this.ignoreMissing = ignoreMissing;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setName(String name) {
|
public List<FieldConf> getFields() {
|
||||||
this.name = name;
|
return fields;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getField() {
|
public void setFields(List<FieldConf> fields) {
|
||||||
return field;
|
this.fields = fields;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setField(String field) {
|
public double getThreshold() {
|
||||||
this.field = field;
|
return threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setThreshold(double threshold) {
|
||||||
|
this.threshold = threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public AggType getAggregation() {
|
||||||
|
return aggregation;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAggregation(AggType aggregation) {
|
||||||
|
this.aggregation = aggregation;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getPositive() {
|
public String getPositive() {
|
||||||
|
@ -81,20 +134,12 @@ public class TreeNodeDef implements Serializable {
|
||||||
this.undefined = undefined;
|
this.undefined = undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, Number> getParams() {
|
|
||||||
return params;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setParams(Map<String, Number> params) {
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
try {
|
try {
|
||||||
return new ObjectMapper().writeValueAsString(this);
|
return new ObjectMapper().writeValueAsString(this);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
return e.getStackTrace().toString();
|
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -5,17 +5,17 @@ import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
public class AbstractTreeNode implements TreeNode {
|
abstract class AbstractComparator implements Comparator {
|
||||||
|
|
||||||
Map<String, Number> params;
|
Map<String, Number> params;
|
||||||
|
|
||||||
public AbstractTreeNode(Map<String, Number> params){
|
public AbstractComparator(Map<String, Number> params){
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(Field a, Field b) {
|
public double compare(Field a, Field b) {
|
||||||
return 0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static double stringSimilarity(String s1, String s2) {
|
public static double stringSimilarity(String s1, String s2) {
|
|
@ -6,15 +6,15 @@ import eu.dnetlib.pace.model.FieldList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@TreeNodeClass("coauthorsMatch")
|
@ComparatorClass("coauthorsMatch")
|
||||||
public class CoauthorsMatch extends AbstractTreeNode {
|
public class CoauthorsMatch extends AbstractComparator {
|
||||||
|
|
||||||
public CoauthorsMatch(Map<String, Number> params) {
|
public CoauthorsMatch(Map<String, Number> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(Field a, Field b) {
|
public double compare(Field a, Field b) {
|
||||||
|
|
||||||
final List<String> c1 = ((FieldList) a).stringList();
|
final List<String> c1 = ((FieldList) a).stringList();
|
||||||
final List<String> c2 = ((FieldList) b).stringList();
|
final List<String> c2 = ((FieldList) b).stringList();
|
||||||
|
@ -24,7 +24,7 @@ public class CoauthorsMatch extends AbstractTreeNode {
|
||||||
|
|
||||||
//few coauthors or too many coauthors
|
//few coauthors or too many coauthors
|
||||||
if (size1 < params.getOrDefault("minCoauthors", 5).intValue() || size2 < params.getOrDefault("minCoauthors", 5).intValue() || (size1+size2 > params.getOrDefault("maxCoauthors", 200).intValue()))
|
if (size1 < params.getOrDefault("minCoauthors", 5).intValue() || size2 < params.getOrDefault("minCoauthors", 5).intValue() || (size1+size2 > params.getOrDefault("maxCoauthors", 200).intValue()))
|
||||||
return 0;
|
return -1;
|
||||||
|
|
||||||
int coauthorship = 0;
|
int coauthorship = 0;
|
||||||
for (String ca1: c1){
|
for (String ca1: c1){
|
||||||
|
@ -36,11 +36,7 @@ public class CoauthorsMatch extends AbstractTreeNode {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (coauthorship>=params.getOrDefault("th", 5).intValue())
|
return coauthorship;
|
||||||
return 1;
|
|
||||||
else if (coauthorship == 0)
|
|
||||||
return -1;
|
|
||||||
else
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
|
public interface Comparator {
|
||||||
|
|
||||||
|
//compares two fields and returns the distance measure, or -1 if undefined
|
||||||
|
public double compare(Field a, Field b);
|
||||||
|
|
||||||
|
}
|
|
@ -7,7 +7,7 @@ import java.lang.annotation.Target;
|
||||||
|
|
||||||
@Retention(RetentionPolicy.RUNTIME)
|
@Retention(RetentionPolicy.RUNTIME)
|
||||||
@Target(ElementType.TYPE)
|
@Target(ElementType.TYPE)
|
||||||
public @interface TreeNodeClass {
|
public @interface ComparatorClass {
|
||||||
|
|
||||||
public String value();
|
public String value();
|
||||||
}
|
}
|
|
@ -4,22 +4,22 @@ import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@TreeNodeClass("exactMatch")
|
@ComparatorClass("exactMatch")
|
||||||
public class ExactMatch extends AbstractTreeNode {
|
public class ExactMatch extends AbstractComparator {
|
||||||
|
|
||||||
public ExactMatch(Map<String, Number> params) {
|
public ExactMatch(Map<String, Number> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(Field a, Field b) {
|
public double compare(Field a, Field b) {
|
||||||
|
|
||||||
if (a.stringValue().isEmpty() || b.stringValue().isEmpty())
|
if (a.stringValue().isEmpty() || b.stringValue().isEmpty())
|
||||||
return 0;
|
return -1;
|
||||||
else if (a.stringValue().equals(b.stringValue()))
|
else if (a.stringValue().equals(b.stringValue()))
|
||||||
return 1;
|
return 1;
|
||||||
else
|
else
|
||||||
return -1;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,18 +4,18 @@ import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@TreeNodeClass("similar")
|
@ComparatorClass("similar")
|
||||||
public class SimilarMatch extends AbstractTreeNode {
|
public class SimilarMatch extends AbstractComparator {
|
||||||
|
|
||||||
public SimilarMatch(Map<String, Number> params) {
|
public SimilarMatch(Map<String, Number> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(Field a, Field b) {
|
public double compare(Field a, Field b) {
|
||||||
|
|
||||||
if (a.stringValue().isEmpty() || b.stringValue().isEmpty())
|
if (a.stringValue().isEmpty() || b.stringValue().isEmpty())
|
||||||
return 0; //undefined if one name is missing
|
return -1; //undefined if one name is missing
|
||||||
|
|
||||||
//take only the first name
|
//take only the first name
|
||||||
String firstname1 = a.stringValue().split(" ")[0];
|
String firstname1 = a.stringValue().split(" ")[0];
|
||||||
|
@ -24,12 +24,7 @@ public class SimilarMatch extends AbstractTreeNode {
|
||||||
if (firstname1.toLowerCase().trim().replaceAll("\\.","").replaceAll("\\s","").length()<=2 || firstname2.toLowerCase().replaceAll("\\.", "").replaceAll("\\s","").length()<=2) //too short names (considered similar)
|
if (firstname1.toLowerCase().trim().replaceAll("\\.","").replaceAll("\\s","").length()<=2 || firstname2.toLowerCase().replaceAll("\\.", "").replaceAll("\\s","").length()<=2) //too short names (considered similar)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (stringSimilarity(firstname1,firstname2)>params.getOrDefault("th", 0.7).doubleValue()){
|
return stringSimilarity(firstname1,firstname2);
|
||||||
return 1; //similar names, go on with the analysis
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return -1; //names too different, no need to compare
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,21 +5,21 @@ import eu.dnetlib.pace.model.FieldListImpl;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@TreeNodeClass("topicsMatch")
|
@ComparatorClass("topicsMatch")
|
||||||
public class TopicsMatch extends AbstractTreeNode {
|
public class TopicsMatch extends AbstractComparator {
|
||||||
|
|
||||||
public TopicsMatch(Map<String, Number> params) {
|
public TopicsMatch(Map<String, Number> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(Field a, Field b) {
|
public double compare(Field a, Field b) {
|
||||||
|
|
||||||
double[] t1 = ((FieldListImpl) a).doubleArray();
|
double[] t1 = ((FieldListImpl) a).doubleArray();
|
||||||
double[] t2 = ((FieldListImpl) b).doubleArray();
|
double[] t2 = ((FieldListImpl) b).doubleArray();
|
||||||
|
|
||||||
if (t1 == null || t2 == null)
|
if (t1 == null || t2 == null)
|
||||||
return 0; //0 similarity if no topics in one of the authors or in both
|
return -1; //undefined (-1) if no topics in one of the authors or in both
|
||||||
|
|
||||||
double area = 0.0;
|
double area = 0.0;
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ public class TopicsMatch extends AbstractTreeNode {
|
||||||
area += min_value[i];
|
area += min_value[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
return area>params.getOrDefault("th", 0.7).doubleValue()?+1:-1;
|
return area;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +0,0 @@
|
||||||
package eu.dnetlib.pace.tree;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
|
|
||||||
public interface TreeNode {
|
|
||||||
|
|
||||||
//compare two fields and returns: +1 if match, 0 if undefined, -1 if do not match
|
|
||||||
public int compare(Field a, Field b);
|
|
||||||
|
|
||||||
}
|
|
|
@ -6,13 +6,13 @@ import eu.dnetlib.pace.model.FieldList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@TreeNodeClass("undefined")
|
@ComparatorClass("undefined")
|
||||||
public class UndefinedNode implements TreeNode {
|
public class UndefinedNode implements Comparator {
|
||||||
|
|
||||||
Map<String, Number> params;
|
Map<String, Number> params;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(Field a, Field b) {
|
public double compare(Field a, Field b) {
|
||||||
|
|
||||||
final List<String> sa = ((FieldList) a).stringList();
|
final List<String> sa = ((FieldList) a).stringList();
|
||||||
final List<String> sb = ((FieldList) b).stringList();
|
final List<String> sb = ((FieldList) b).stringList();
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
|
public enum AggType {
|
||||||
|
|
||||||
|
AVG,
|
||||||
|
SUM,
|
||||||
|
MAX,
|
||||||
|
MIN
|
||||||
|
}
|
|
@ -1,13 +1,12 @@
|
||||||
package eu.dnetlib.pace.tree.support;
|
package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
|
||||||
|
|
||||||
public enum MatchType {
|
public enum MatchType {
|
||||||
|
|
||||||
ORCID_MATCH,
|
ORCID_MATCH,
|
||||||
COAUTHORS_MATCH,
|
COAUTHORS_MATCH,
|
||||||
TOPICS_MATCH,
|
TOPICS_MATCH,
|
||||||
NO_MATCH;
|
NO_MATCH,
|
||||||
|
UNDEFINED;
|
||||||
|
|
||||||
public static MatchType getEnum(String value) {
|
public static MatchType getEnum(String value) {
|
||||||
|
|
||||||
|
@ -15,7 +14,7 @@ public enum MatchType {
|
||||||
return MatchType.valueOf(value);
|
return MatchType.valueOf(value);
|
||||||
}
|
}
|
||||||
catch (IllegalArgumentException e) {
|
catch (IllegalArgumentException e) {
|
||||||
throw new PaceException("The match type is not valid");
|
return MatchType.UNDEFINED;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -70,36 +70,40 @@ public class BlockProcessor {
|
||||||
final String idCurr = curr.getIdentifier();
|
final String idCurr = curr.getIdentifier();
|
||||||
|
|
||||||
//check if pivot and current element are similar by processing the tree
|
//check if pivot and current element are similar by processing the tree
|
||||||
if (navigateTree(pivot, curr))
|
if (navigateTree(pivot, curr)!=MatchType.NO_MATCH)
|
||||||
writeSimilarity(context, idPivot, idCurr);
|
writeSimilarity(context, idPivot, idCurr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean navigateTree(final MapDocument doc1, final MapDocument doc2){
|
private MatchType navigateTree(final MapDocument doc1, final MapDocument doc2){
|
||||||
|
|
||||||
final Map<String, TreeNodeDef> decisionTree = dedupConf.getPace().getDecisionTree();
|
final Map<String, TreeNodeDef> decisionTree = dedupConf.getPace().getDecisionTree();
|
||||||
|
|
||||||
String current = "start";
|
String current = "start";
|
||||||
|
|
||||||
while (!current.equals(MatchType.NO_MATCH.toString()) && !current.equals(MatchType.ORCID_MATCH.toString()) && !current.equals(MatchType.TOPICS_MATCH.toString()) && !current.equals(MatchType.COAUTHORS_MATCH.toString())) {
|
while (MatchType.getEnum(current)==MatchType.UNDEFINED) {
|
||||||
|
|
||||||
TreeNodeDef currentNode = decisionTree.get(current);
|
TreeNodeDef currentNode = decisionTree.get(current);
|
||||||
//throw an exception if the node doesn't exist
|
//throw an exception if the node doesn't exist
|
||||||
if (currentNode == null)
|
if (currentNode == null)
|
||||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
throw new PaceException("The Tree Node doesn't exist: " + current);
|
||||||
|
|
||||||
int compare = currentNode.treeNode().compare(doc1.getFieldMap().get(currentNode.getField()), doc2.getFieldMap().get(currentNode.getField()));
|
double similarity = currentNode.evaluate(doc1, doc2);
|
||||||
|
|
||||||
|
if (similarity == -1) {
|
||||||
|
current = currentNode.getUndefined();
|
||||||
|
}
|
||||||
|
else if (similarity>=currentNode.getThreshold()){
|
||||||
|
current = currentNode.getPositive();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
current = currentNode.getNegative();
|
||||||
|
}
|
||||||
|
|
||||||
current = (compare==0)?currentNode.getUndefined():(compare==-1)?currentNode.getNegative():currentNode.getPositive();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!current.equals(MatchType.NO_MATCH.toString()))
|
return MatchType.getEnum(current);
|
||||||
return true;
|
|
||||||
else
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
|
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
|
||||||
|
|
|
@ -7,8 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass;
|
||||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
import eu.dnetlib.pace.tree.TreeNode;
|
import eu.dnetlib.pace.tree.Comparator;
|
||||||
import eu.dnetlib.pace.tree.TreeNodeClass;
|
import eu.dnetlib.pace.tree.ComparatorClass;
|
||||||
import org.reflections.Reflections;
|
import org.reflections.Reflections;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
@ -22,7 +22,7 @@ public class PaceResolver implements Serializable {
|
||||||
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
|
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
|
||||||
private final Map<String, Class<ConditionAlgo>> conditionAlgos;
|
private final Map<String, Class<ConditionAlgo>> conditionAlgos;
|
||||||
private final Map<String, Class<DistanceAlgo>> distanceAlgos;
|
private final Map<String, Class<DistanceAlgo>> distanceAlgos;
|
||||||
private final Map<String, Class<TreeNode>> treeNodes;
|
private final Map<String, Class<Comparator>> comparators;
|
||||||
|
|
||||||
public PaceResolver() {
|
public PaceResolver() {
|
||||||
|
|
||||||
|
@ -38,9 +38,9 @@ public class PaceResolver implements Serializable {
|
||||||
.filter(DistanceAlgo.class::isAssignableFrom)
|
.filter(DistanceAlgo.class::isAssignableFrom)
|
||||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
||||||
|
|
||||||
this.treeNodes = new Reflections("eu.dnetlib").getTypesAnnotatedWith(TreeNodeClass.class).stream()
|
this.comparators = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ComparatorClass.class).stream()
|
||||||
.filter(TreeNode.class::isAssignableFrom)
|
.filter(Comparator.class::isAssignableFrom)
|
||||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(TreeNodeClass.class).value(), cl -> (Class<TreeNode>) cl));
|
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>) cl));
|
||||||
}
|
}
|
||||||
|
|
||||||
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
|
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
|
||||||
|
@ -67,9 +67,9 @@ public class PaceResolver implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public TreeNode getTreeNode(String name, Map<String, Number> params) throws PaceException {
|
public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
|
||||||
try {
|
try {
|
||||||
return treeNodes.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
|
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
|
||||||
throw new PaceException(name + " not found ", e);
|
throw new PaceException(name + " not found ", e);
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,9 +56,10 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
public void testJaroWinklerNormalizedName2() {
|
public void testJaroWinklerNormalizedName2() {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
double result = jaroWinklerNormalizedName.distance("University of Pisa", "Universita degli studi di Pisa");
|
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York");
|
||||||
|
|
||||||
assertEquals(result, 1.0);
|
assertEquals(result, 1.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue