diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index c08cae97d..757241192 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -193,11 +193,38 @@ public abstract class AbstractPaceFunctions { } public String normalizeCities(String s1, Map cityMap){ + //TODO change normalization mode + for (String city : cityMap.keySet()) s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " "); return s1; } + public String normalizeCities2 (String s1, Map cityMap, int windowSize){ + + List tokens = Arrays.asList(s1.split(" ")); + + if (tokens.size() params; //parameters + + public FieldConf() { + } + + public FieldConf(String field, String comparator, double weight, Map params) { + this.field = field; + this.comparator = comparator; + this.weight = weight; + this.params = params; + } + + public String getField() { + return field; + } + + public void setField(String field) { + this.field = field; + } + + public String getComparator() { + return comparator; + } + + public void setComparator(String comparator) { + this.comparator = comparator; + } + + public double getWeight() { + return weight; + } + + public void setWeight(double weight) { + this.weight = weight; + } + + public Map getParams() { + return params; + } + + public void setParams(Map params) { + this.params = params; + } + + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java index 61b68b30e..b1d4917b3 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java @@ -1,60 +1,113 @@ package eu.dnetlib.pace.model; import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.tree.TreeNode; +import eu.dnetlib.pace.tree.Comparator; +import eu.dnetlib.pace.tree.support.AggType; import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; import org.codehaus.jackson.map.ObjectMapper; import java.io.IOException; import java.io.Serializable; -import java.util.Map; +import java.util.List; public class TreeNodeDef implements Serializable { - private String name; - private String field; + private List fields; //list of fields involved in the tree node (contains comparators to be used and field on which apply the comparator) + private AggType aggregation; //how to aggregate similarity measures for every field - private String positive; - private String negative; - private String undefined; + private double threshold; //threshold on the similarity measure - private Map params; + private String positive; //specifies the next node in case of positive result: similarity>=th + private String negative; //specifies the next node in case of negative result: similarity params) { - this.name = name; - this.field = field; + //compute the similarity measure between two documents + public double evaluate(MapDocument doc1, MapDocument doc2) { + + DescriptiveStatistics stats = new DescriptiveStatistics(); + + for (FieldConf fieldConf : fields) { + + double weight = fieldConf.getWeight(); + + double similarity = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())); + + //if similarity is -1 means that a comparator gave undefined, do not add result to the stats + if (similarity != -1) { + stats.addValue(weight * similarity); + } + else { + if (!ignoreMissing) //if the missing value has 
not to be ignored, return -1 + return -1; + } + } + + switch (aggregation){ + + case AVG: + return stats.getMean(); + case SUM: + return stats.getSum(); + case MAX: + return stats.getMax(); + case MIN: + return stats.getMin(); + default: + return 0.0; + } + + } + + private Comparator comparator(final FieldConf field){ + + return PaceConfig.paceResolver.getComparator(field.getComparator(), field.getParams()); + } + + public TreeNodeDef(List fields, double threshold, AggType aggregation, String positive, String negative, String undefined) { + this.fields = fields; + this.threshold = threshold; + this.aggregation = aggregation; this.positive = positive; this.negative = negative; this.undefined = undefined; - this.params = params; } - public TreeNode treeNode() { - try { - return PaceConfig.paceResolver.getTreeNode(getName(), params); - } catch (PaceException e) { - e.printStackTrace(); - return null; - } + public boolean isIgnoreMissing() { + return ignoreMissing; } - public String getName() { - return name; + public void setIgnoreMissing(boolean ignoreMissing) { + this.ignoreMissing = ignoreMissing; } - public void setName(String name) { - this.name = name; + public List getFields() { + return fields; } - public String getField() { - return field; + public void setFields(List fields) { + this.fields = fields; } - public void setField(String field) { - this.field = field; + public double getThreshold() { + return threshold; + } + + public void setThreshold(double threshold) { + this.threshold = threshold; + } + + public AggType getAggregation() { + return aggregation; + } + + public void setAggregation(AggType aggregation) { + this.aggregation = aggregation; } public String getPositive() { @@ -81,20 +134,12 @@ public class TreeNodeDef implements Serializable { this.undefined = undefined; } - public Map getParams() { - return params; - } - - public void setParams(Map params) { - this.params = params; - } - @Override public String toString() { try { return new 
ObjectMapper().writeValueAsString(this); } catch (IOException e) { - return e.getStackTrace().toString(); + throw new PaceException("Impossible to convert to JSON: ", e); } } } \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractTreeNode.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractComparator.java similarity index 80% rename from dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractTreeNode.java rename to dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractComparator.java index 230ec227c..76e41ae9c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractTreeNode.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractComparator.java @@ -5,17 +5,17 @@ import org.apache.commons.lang.StringUtils; import java.util.Map; -public class AbstractTreeNode implements TreeNode { +abstract class AbstractComparator implements Comparator { Map params; - public AbstractTreeNode(Map params){ + public AbstractComparator(Map params){ this.params = params; } @Override - public int compare(Field a, Field b) { - return 0; + public double compare(Field a, Field b) { + return 0.0; } public static double stringSimilarity(String s1, String s2) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java index 45bc32af1..ace3acc21 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java @@ -6,15 +6,15 @@ import eu.dnetlib.pace.model.FieldList; import java.util.List; import java.util.Map; -@TreeNodeClass("coauthorsMatch") -public class CoauthorsMatch extends AbstractTreeNode { +@ComparatorClass("coauthorsMatch") +public class CoauthorsMatch extends AbstractComparator { public CoauthorsMatch(Map params) { super(params); } @Override - public int compare(Field a, Field b) { + public double 
compare(Field a, Field b) { final List c1 = ((FieldList) a).stringList(); final List c2 = ((FieldList) b).stringList(); @@ -24,7 +24,7 @@ public class CoauthorsMatch extends AbstractTreeNode { //few coauthors or too many coauthors if (size1 < params.getOrDefault("minCoauthors", 5).intValue() || size2 < params.getOrDefault("minCoauthors", 5).intValue() || (size1+size2 > params.getOrDefault("maxCoauthors", 200).intValue())) - return 0; + return -1; int coauthorship = 0; for (String ca1: c1){ @@ -36,11 +36,7 @@ public class CoauthorsMatch extends AbstractTreeNode { } } - if (coauthorship>=params.getOrDefault("th", 5).intValue()) - return 1; - else if (coauthorship == 0) - return -1; - else - return 0; + return coauthorship; + } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java new file mode 100644 index 000000000..087028ba2 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java @@ -0,0 +1,10 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.model.Field; + +public interface Comparator { + + //compare two fields and returns: the distance measure, -1 if undefined + public double compare(Field a, Field b); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNodeClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ComparatorClass.java similarity index 88% rename from dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNodeClass.java rename to dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ComparatorClass.java index 3db53eaa2..a04fba8ee 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNodeClass.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ComparatorClass.java @@ -7,7 +7,7 @@ import java.lang.annotation.Target; @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.TYPE) -public @interface TreeNodeClass { +public @interface ComparatorClass { public String value(); } 
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java index b03bae6ec..8e0e60173 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -4,22 +4,22 @@ import eu.dnetlib.pace.model.Field; import java.util.Map; -@TreeNodeClass("exactMatch") -public class ExactMatch extends AbstractTreeNode { +@ComparatorClass("exactMatch") +public class ExactMatch extends AbstractComparator { public ExactMatch(Map params) { super(params); } @Override - public int compare(Field a, Field b) { + public double compare(Field a, Field b) { if (a.stringValue().isEmpty() || b.stringValue().isEmpty()) - return 0; + return -1; else if (a.stringValue().equals(b.stringValue())) return 1; else - return -1; + return 0; } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java index fd52db2b9..f8f5fe144 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java @@ -4,18 +4,18 @@ import eu.dnetlib.pace.model.Field; import java.util.Map; -@TreeNodeClass("similar") -public class SimilarMatch extends AbstractTreeNode { +@ComparatorClass("similar") +public class SimilarMatch extends AbstractComparator { public SimilarMatch(Map params) { super(params); } @Override - public int compare(Field a, Field b) { + public double compare(Field a, Field b) { if (a.stringValue().isEmpty() || b.stringValue().isEmpty()) - return 0; //undefined if one name is missing + return -1; //undefined if one name is missing //take only the first name String firstname1 = a.stringValue().split(" ")[0]; @@ -24,12 +24,7 @@ public class SimilarMatch extends AbstractTreeNode { if 
(firstname1.toLowerCase().trim().replaceAll("\\.","").replaceAll("\\s","").length()<=2 || firstname2.toLowerCase().replaceAll("\\.", "").replaceAll("\\s","").length()<=2) //too short names (considered similar) return 1; - if (stringSimilarity(firstname1,firstname2)>params.getOrDefault("th", 0.7).doubleValue()){ - return 1; //similar names, go on with the analysis - } - else { - return -1; //names too different, no need to compare - } + return stringSimilarity(firstname1,firstname2); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java index 67d05bbcd..ea798c7a7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java @@ -5,21 +5,21 @@ import eu.dnetlib.pace.model.FieldListImpl; import java.util.Map; -@TreeNodeClass("topicsMatch") -public class TopicsMatch extends AbstractTreeNode { +@ComparatorClass("topicsMatch") +public class TopicsMatch extends AbstractComparator { public TopicsMatch(Map params) { super(params); } @Override - public int compare(Field a, Field b) { + public double compare(Field a, Field b) { double[] t1 = ((FieldListImpl) a).doubleArray(); double[] t2 = ((FieldListImpl) b).doubleArray(); if (t1 == null || t2 == null) - return 0; //0 similarity if no topics in one of the authors or in both + return -1; //undefined if no topics in one of the authors or in both double area = 0.0; @@ -30,7 +30,7 @@ public class TopicsMatch extends AbstractTreeNode { area += min_value[i]; } - return area>params.getOrDefault("th", 0.7).doubleValue()?+1:-1; + return area; } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNode.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNode.java deleted file mode 100644 index 6c7eb9eb0..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNode.java +++ /dev/null @@ -1,10 +0,0 @@ -package 
eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.model.Field; - -public interface TreeNode { - - //compare two fields and returns: +1 if match, 0 if undefined, -1 if do not match - public int compare(Field a, Field b); - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UndefinedNode.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UndefinedNode.java index 157240739..cf90847de 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UndefinedNode.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UndefinedNode.java @@ -6,13 +6,13 @@ import eu.dnetlib.pace.model.FieldList; import java.util.List; import java.util.Map; -@TreeNodeClass("undefined") -public class UndefinedNode implements TreeNode { +@ComparatorClass("undefined") +public class UndefinedNode implements Comparator { Map params; @Override - public int compare(Field a, Field b) { + public double compare(Field a, Field b) { final List sa = ((FieldList) a).stringList(); final List sb = ((FieldList) b).stringList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java new file mode 100644 index 000000000..71e3ad0d4 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java @@ -0,0 +1,9 @@ +package eu.dnetlib.pace.tree.support; + +public enum AggType { + + AVG, + SUM, + MAX, + MIN +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java index 2160f9954..158d3f99f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java @@ -1,13 +1,12 @@ package eu.dnetlib.pace.tree.support; -import eu.dnetlib.pace.util.PaceException; - public enum MatchType { ORCID_MATCH, COAUTHORS_MATCH, TOPICS_MATCH, - NO_MATCH; + NO_MATCH, + UNDEFINED; public 
static MatchType getEnum(String value) { @@ -15,7 +14,7 @@ public enum MatchType { return MatchType.valueOf(value); } catch (IllegalArgumentException e) { - throw new PaceException("The match type is not valid"); + return MatchType.UNDEFINED; } } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 02a34b253..19105aefc 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -70,36 +70,40 @@ public class BlockProcessor { final String idCurr = curr.getIdentifier(); //check if pivot and current element are similar by processing the tree - if (navigateTree(pivot, curr)) + if (navigateTree(pivot, curr)!=MatchType.NO_MATCH) writeSimilarity(context, idPivot, idCurr); } - - } } - private boolean navigateTree(final MapDocument doc1, final MapDocument doc2){ + private MatchType navigateTree(final MapDocument doc1, final MapDocument doc2){ final Map decisionTree = dedupConf.getPace().getDecisionTree(); String current = "start"; - while (!current.equals(MatchType.NO_MATCH.toString()) && !current.equals(MatchType.ORCID_MATCH.toString()) && !current.equals(MatchType.TOPICS_MATCH.toString()) && !current.equals(MatchType.COAUTHORS_MATCH.toString())) { + while (MatchType.getEnum(current)==MatchType.UNDEFINED) { TreeNodeDef currentNode = decisionTree.get(current); //throw an exception if the node doesn't exist if (currentNode == null) throw new PaceException("The Tree Node doesn't exist: " + current); - int compare = currentNode.treeNode().compare(doc1.getFieldMap().get(currentNode.getField()), doc2.getFieldMap().get(currentNode.getField())); + double similarity = currentNode.evaluate(doc1, doc2); + + if (similarity == -1) { + current = currentNode.getUndefined(); + } + else if (similarity>=currentNode.getThreshold()){ + current = currentNode.getPositive(); + } + else { 
+ current = currentNode.getNegative(); + } - current = (compare==0)?currentNode.getUndefined():(compare==-1)?currentNode.getNegative():currentNode.getPositive(); } - if (!current.equals(MatchType.NO_MATCH.toString())) - return true; - else - return false; + return MatchType.getEnum(current); } private Queue prepare(final Iterable documents) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index 00ca6265f..d8276549a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -7,8 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass; import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.FieldDef; -import eu.dnetlib.pace.tree.TreeNode; -import eu.dnetlib.pace.tree.TreeNodeClass; +import eu.dnetlib.pace.tree.Comparator; +import eu.dnetlib.pace.tree.ComparatorClass; import org.reflections.Reflections; import java.io.Serializable; @@ -22,7 +22,7 @@ public class PaceResolver implements Serializable { private final Map> clusteringFunctions; private final Map> conditionAlgos; private final Map> distanceAlgos; - private final Map> treeNodes; + private final Map> comparators; public PaceResolver() { @@ -38,9 +38,9 @@ public class PaceResolver implements Serializable { .filter(DistanceAlgo.class::isAssignableFrom) .collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class)cl)); - this.treeNodes = new Reflections("eu.dnetlib").getTypesAnnotatedWith(TreeNodeClass.class).stream() - .filter(TreeNode.class::isAssignableFrom) - .collect(Collectors.toMap(cl -> cl.getAnnotation(TreeNodeClass.class).value(), cl -> (Class) cl)); + this.comparators = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ComparatorClass.class).stream() + .filter(Comparator.class::isAssignableFrom) + 
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class) cl)); } public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { @@ -67,9 +67,9 @@ public class PaceResolver implements Serializable { } } - public TreeNode getTreeNode(String name, Map params) throws PaceException { + public Comparator getComparator(String name, Map params) throws PaceException { try { - return treeNodes.get(name).getDeclaredConstructor(Map.class).newInstance(params); + return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { throw new PaceException(name + " not found ", e); } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 45540636c..883dde57c 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -56,9 +56,10 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName2() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("University of Pisa", "Universita degli studi di Pisa"); + double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York"); assertEquals(result, 1.0); } + }