From 9ff83d6567a494f2e24d14eb3f934e6f63c58166 Mon Sep 17 00:00:00 2001 From: Michele De Bonis Date: Thu, 20 Dec 2018 09:54:41 +0100 Subject: [PATCH] implementation of the decision tree for the deduplication of the authors, implementation of multiple comparators to be used in a tree node and definition of the proto for person entity --- .../resources/eu/dnetlib/pace/authors.json | 2 +- .../eu/dnetlib/pace/authors.test.pace.conf | 15 +-- .../pace/common/AbstractPaceFunctions.java | 27 ++++ .../algo/JaroWinklerNormalizedName.java | 11 +- .../java/eu/dnetlib/pace/model/FieldConf.java | 67 ++++++++++ .../eu/dnetlib/pace/model/TreeNodeDef.java | 117 ++++++++++++------ ...tTreeNode.java => AbstractComparator.java} | 8 +- .../eu/dnetlib/pace/tree/CoauthorsMatch.java | 16 +-- .../java/eu/dnetlib/pace/tree/Comparator.java | 10 ++ ...reeNodeClass.java => ComparatorClass.java} | 2 +- .../java/eu/dnetlib/pace/tree/ExactMatch.java | 10 +- .../eu/dnetlib/pace/tree/SimilarMatch.java | 15 +-- .../eu/dnetlib/pace/tree/TopicsMatch.java | 10 +- .../java/eu/dnetlib/pace/tree/TreeNode.java | 10 -- .../eu/dnetlib/pace/tree/UndefinedNode.java | 6 +- .../eu/dnetlib/pace/tree/support/AggType.java | 9 ++ .../dnetlib/pace/tree/support/MatchType.java | 7 +- .../eu/dnetlib/pace/util/BlockProcessor.java | 26 ++-- .../eu/dnetlib/pace/util/PaceResolver.java | 16 +-- .../pace/distance/DistanceAlgoTest.java | 3 +- 20 files changed, 268 insertions(+), 119 deletions(-) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java rename dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/{AbstractTreeNode.java => AbstractComparator.java} (80%) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java rename dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/{TreeNodeClass.java => ComparatorClass.java} (88%) delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNode.java create mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.json b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.json index 185ac73..36c8021 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.json +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.json @@ -1,2 +1,2 @@ { "type": 30, "id": "30|author::id1", "person": { "metadata":{"orcid": "orcid1", "fullname": "smith, john", "firstname": "john", "lastname": "smith", "pubID": "pubid1", "pubDOI": "pubdoi1", "coauthors": ["la bruzzo, sandro", "atzori, claudio", "baglioni, miriam", "bardi, alessia"], "topics": [0.0,0.0,0.0], "rank":1, "area":"1"}}} -{ "type": 30, "id": "30|author::id2", "person": { "metadata":{"orcid": "orcid2", "fullname": "smith, john", "firstname": "john", "lastname": "smith", "pubID": "pubid2", "pubDOI": "pubdoi2", "coauthors": ["la bruzzo, sandro", "atzori, claudio", "baglioni, miriam", "bardi, alessia"], "topics": [0.0,0.0,0.0], "rank":3, "area":"1"}}} \ No newline at end of file +{ "type": 30, "id": "30|author::id2", "person": { "metadata":{"orcid": "", "fullname": "smith, john", "firstname": "john", "lastname": "smith", "pubID": "pubid2", "pubDOI": "pubdoi2", "coauthors": ["la bruzzo, sandro", "atzori, claudio", "baglioni, miriam", "bardi, alessia"], "topics": [0.0,0.0,0.0], "rank":3, "area":"1"}}} \ No newline at end of file diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.test.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.test.pace.conf index 31859cc..c804a60 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.test.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.test.pace.conf @@ -16,12 +16,12 @@ ], "conditions": [], "decisionTree": { - "start": {"name": "exactMatch", "field": "pubID", "positive": "NO_MATCH", "negative": "layer2", "undefined": "layer2", "params": {}}, - 
"layer2": {"name": "exactMatch", "field": "orcid", "positive": "ORCID_MATCH", "negative": "NO_MATCH", "undefined": "layer3", "params": {}}, - "layer3": {"name": "similar", "field": "firstname", "positive": "layer4", "negative": "NO_MATCH", "undefined": "layer4", "params": { "th": 0.7}}, - "layer4": {"name": "coauthorsMatch", "field": "coauthors", "positive": "COAUTHORS_MATCH", "negative": "NO_MATCH", "undefined": "layer5", "params": {"th": 5, "minCoauthors": 6, "maxCoauthors": 200}}, - "layer5": {"name" : "exactMatch", "field": "area", "positive": "layer6", "negative": "NO_MATCH", "undefined": "NO_MATCH", "params": {}}, - "layer6": {"name": "topicsMatch", "field": "topics", "positive": "TOPICS_MATCH", "negative": "NO_MATCH", "undefined": "NO_MATCH", "params": {"th": 0.7}} + "start": {"fields": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"}, + "layer2": {"fields": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"}, + "layer3": {"fields": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"}, + "layer4": {"fields": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"}, + "layer5": {"fields": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}, + 
"layer6": {"fields": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"} }, "model": [ {"name": "fullname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/fullname"}, @@ -32,7 +32,8 @@ {"name": "topics", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/topics"}, {"name": "pubID", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/pubID"}, {"name": "pubDOI", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/pubDOI"}, - {"name": "rank", "algo": "Null", "type": "Int", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/rank"} + {"name": "rank", "algo": "Null", "type": "Int", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/rank"}, + {"name": "area", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/area"} ], "blacklists": {} } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index c08cae9..7572411 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -193,11 +193,38 @@ public abstract class AbstractPaceFunctions { } public String normalizeCities(String s1, Map cityMap){ + //TODO change normalization mode + for (String city : cityMap.keySet()) s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " "); return s1; } + public String normalizeCities2 (String s1, Map cityMap, int windowSize){ + + List tokens = Arrays.asList(s1.split(" ")); + + if (tokens.size() params; 
//parameters + + public FieldConf() { + } + + public FieldConf(String field, String comparator, double weight, Map params) { + this.field = field; + this.comparator = comparator; + this.weight = weight; + this.params = params; + } + + public String getField() { + return field; + } + + public void setField(String field) { + this.field = field; + } + + public String getComparator() { + return comparator; + } + + public void setComparator(String comparator) { + this.comparator = comparator; + } + + public double getWeight() { + return weight; + } + + public void setWeight(double weight) { + this.weight = weight; + } + + public Map getParams() { + return params; + } + + public void setParams(Map params) { + this.params = params; + } + + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java index 61b68b3..b1d4917 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/TreeNodeDef.java @@ -1,60 +1,113 @@ package eu.dnetlib.pace.model; import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.tree.TreeNode; +import eu.dnetlib.pace.tree.Comparator; +import eu.dnetlib.pace.tree.support.AggType; import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; import org.codehaus.jackson.map.ObjectMapper; import java.io.IOException; import java.io.Serializable; -import java.util.Map; +import java.util.List; public class TreeNodeDef implements Serializable { - private String name; - private String field; + private List fields; //list of fields involved in the tree node (contains comparators to be used and field on 
which apply the comparator) + private AggType aggregation; //how to aggregate similarity measures for every field - private String positive; - private String negative; - private String undefined; + private double threshold; //threshold on the similarity measure - private Map params; + private String positive; //specifies the next node in case of positive result: similarity>=th + private String negative; //specifies the next node in case of negative result: similarity params) { - this.name = name; - this.field = field; + //compute the similarity measure between two documents + public double evaluate(MapDocument doc1, MapDocument doc2) { + + DescriptiveStatistics stats = new DescriptiveStatistics(); + + for (FieldConf fieldConf : fields) { + + double weight = fieldConf.getWeight(); + + double similarity = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())); + + //if similarity is -1 means that a comparator gave undefined, do not add result to the stats + if (similarity != -1) { + stats.addValue(weight * similarity); + } + else { + if (!ignoreMissing) //if the missing value has not to be ignored, return -1 + return -1; + } + } + + switch (aggregation){ + + case AVG: + return stats.getMean(); + case SUM: + return stats.getSum(); + case MAX: + return stats.getMax(); + case MIN: + return stats.getMin(); + default: + return 0.0; + } + + } + + private Comparator comparator(final FieldConf field){ + + return PaceConfig.paceResolver.getComparator(field.getComparator(), field.getParams()); + } + + public TreeNodeDef(List fields, double threshold, AggType aggregation, String positive, String negative, String undefined) { + this.fields = fields; + this.threshold = threshold; + this.aggregation = aggregation; this.positive = positive; this.negative = negative; this.undefined = undefined; - this.params = params; } - public TreeNode treeNode() { - try { - return PaceConfig.paceResolver.getTreeNode(getName(), 
params); - } catch (PaceException e) { - e.printStackTrace(); - return null; - } + public boolean isIgnoreMissing() { + return ignoreMissing; } - public String getName() { - return name; + public void setIgnoreMissing(boolean ignoreMissing) { + this.ignoreMissing = ignoreMissing; } - public void setName(String name) { - this.name = name; + public List getFields() { + return fields; } - public String getField() { - return field; + public void setFields(List fields) { + this.fields = fields; } - public void setField(String field) { - this.field = field; + public double getThreshold() { + return threshold; + } + + public void setThreshold(double threshold) { + this.threshold = threshold; + } + + public AggType getAggregation() { + return aggregation; + } + + public void setAggregation(AggType aggregation) { + this.aggregation = aggregation; } public String getPositive() { @@ -81,20 +134,12 @@ public class TreeNodeDef implements Serializable { this.undefined = undefined; } - public Map getParams() { - return params; - } - - public void setParams(Map params) { - this.params = params; - } - @Override public String toString() { try { return new ObjectMapper().writeValueAsString(this); } catch (IOException e) { - return e.getStackTrace().toString(); + throw new PaceException("Impossible to convert to JSON: ", e); } } } \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractTreeNode.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractComparator.java similarity index 80% rename from dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractTreeNode.java rename to dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractComparator.java index 230ec22..76e41ae 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractTreeNode.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AbstractComparator.java @@ -5,17 +5,17 @@ import org.apache.commons.lang.StringUtils; import java.util.Map; -public class 
AbstractTreeNode implements TreeNode { +abstract class AbstractComparator implements Comparator { Map params; - public AbstractTreeNode(Map params){ + public AbstractComparator(Map params){ this.params = params; } @Override - public int compare(Field a, Field b) { - return 0; + public double compare(Field a, Field b) { + return 0.0; } public static double stringSimilarity(String s1, String s2) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java index 45bc32a..ace3acc 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CoauthorsMatch.java @@ -6,15 +6,15 @@ import eu.dnetlib.pace.model.FieldList; import java.util.List; import java.util.Map; -@TreeNodeClass("coauthorsMatch") -public class CoauthorsMatch extends AbstractTreeNode { +@ComparatorClass("coauthorsMatch") +public class CoauthorsMatch extends AbstractComparator { public CoauthorsMatch(Map params) { super(params); } @Override - public int compare(Field a, Field b) { + public double compare(Field a, Field b) { final List c1 = ((FieldList) a).stringList(); final List c2 = ((FieldList) b).stringList(); @@ -24,7 +24,7 @@ public class CoauthorsMatch extends AbstractTreeNode { //few coauthors or too many coauthors if (size1 < params.getOrDefault("minCoauthors", 5).intValue() || size2 < params.getOrDefault("minCoauthors", 5).intValue() || (size1+size2 > params.getOrDefault("maxCoauthors", 200).intValue())) - return 0; + return -1; int coauthorship = 0; for (String ca1: c1){ @@ -36,11 +36,7 @@ public class CoauthorsMatch extends AbstractTreeNode { } } - if (coauthorship>=params.getOrDefault("th", 5).intValue()) - return 1; - else if (coauthorship == 0) - return -1; - else - return 0; + return coauthorship; + } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java new file mode 100644 index 0000000..087028b --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Comparator.java @@ -0,0 +1,10 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.model.Field; + +public interface Comparator { + + //compare two fields and return the similarity measure, or -1 if undefined + public double compare(Field a, Field b); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNodeClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ComparatorClass.java similarity index 88% rename from dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNodeClass.java rename to dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ComparatorClass.java index 3db53ea..a04fba8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNodeClass.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ComparatorClass.java @@ -7,7 +7,7 @@ import java.lang.annotation.Target; @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.TYPE) -public @interface TreeNodeClass { +public @interface ComparatorClass { public String value(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java index b03bae6..8e0e601 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -4,22 +4,22 @@ import eu.dnetlib.pace.model.Field; import java.util.Map; -@TreeNodeClass("exactMatch") -public class ExactMatch extends AbstractTreeNode { +@ComparatorClass("exactMatch") +public class ExactMatch extends AbstractComparator { public ExactMatch(Map params) { super(params); } @Override - public int compare(Field a, Field b) { + public double compare(Field a, Field b) { if (a.stringValue().isEmpty() || b.stringValue().isEmpty()) - return 0; + return -1; else if
(a.stringValue().equals(b.stringValue())) return 1; else - return -1; + return 0; } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java index fd52db2..f8f5fe1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SimilarMatch.java @@ -4,18 +4,18 @@ import eu.dnetlib.pace.model.Field; import java.util.Map; -@TreeNodeClass("similar") -public class SimilarMatch extends AbstractTreeNode { +@ComparatorClass("similar") +public class SimilarMatch extends AbstractComparator { public SimilarMatch(Map params) { super(params); } @Override - public int compare(Field a, Field b) { + public double compare(Field a, Field b) { if (a.stringValue().isEmpty() || b.stringValue().isEmpty()) - return 0; //undefined if one name is missing + return -1; //undefined if one name is missing //take only the first name String firstname1 = a.stringValue().split(" ")[0]; @@ -24,12 +24,7 @@ public class SimilarMatch extends AbstractTreeNode { if (firstname1.toLowerCase().trim().replaceAll("\\.","").replaceAll("\\s","").length()<=2 || firstname2.toLowerCase().replaceAll("\\.", "").replaceAll("\\s","").length()<=2) //too short names (considered similar) return 1; - if (stringSimilarity(firstname1,firstname2)>params.getOrDefault("th", 0.7).doubleValue()){ - return 1; //similar names, go on with the analysis - } - else { - return -1; //names too different, no need to compare - } + return stringSimilarity(firstname1,firstname2); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java index 67d05bb..ea798c7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TopicsMatch.java @@ -5,21 +5,21 @@ import eu.dnetlib.pace.model.FieldListImpl; import 
java.util.Map; -@TreeNodeClass("topicsMatch") -public class TopicsMatch extends AbstractTreeNode { +@ComparatorClass("topicsMatch") +public class TopicsMatch extends AbstractComparator { public TopicsMatch(Map params) { super(params); } @Override - public int compare(Field a, Field b) { + public double compare(Field a, Field b) { double[] t1 = ((FieldListImpl) a).doubleArray(); double[] t2 = ((FieldListImpl) b).doubleArray(); if (t1 == null || t2 == null) - return 0; //0 similarity if no topics in one of the authors or in both + return -1; //undefined (-1) if no topics in one of the authors or in both double area = 0.0; @@ -30,7 +30,7 @@ public class TopicsMatch extends AbstractTreeNode { area += min_value[i]; } - return area>params.getOrDefault("th", 0.7).doubleValue()?+1:-1; + return area; } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNode.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNode.java deleted file mode 100644 index 6c7eb9e..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TreeNode.java +++ /dev/null @@ -1,10 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.model.Field; - -public interface TreeNode { - - //compare two fields and returns: +1 if match, 0 if undefined, -1 if do not match - public int compare(Field a, Field b); - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UndefinedNode.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UndefinedNode.java index 1572407..cf90847 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UndefinedNode.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UndefinedNode.java @@ -6,13 +6,13 @@ import eu.dnetlib.pace.model.FieldList; import java.util.List; import java.util.Map; -@TreeNodeClass("undefined") -public class UndefinedNode implements TreeNode { +@ComparatorClass("undefined") +public class UndefinedNode implements Comparator { Map params; @Override - public int compare(Field a, Field b) { + public
double compare(Field a, Field b) { final List sa = ((FieldList) a).stringList(); final List sb = ((FieldList) b).stringList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java new file mode 100644 index 0000000..71e3ad0 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java @@ -0,0 +1,9 @@ +package eu.dnetlib.pace.tree.support; + +public enum AggType { + + AVG, + SUM, + MAX, + MIN +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java index 2160f99..158d3f9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java @@ -1,13 +1,12 @@ package eu.dnetlib.pace.tree.support; -import eu.dnetlib.pace.util.PaceException; - public enum MatchType { ORCID_MATCH, COAUTHORS_MATCH, TOPICS_MATCH, - NO_MATCH; + NO_MATCH, + UNDEFINED; public static MatchType getEnum(String value) { @@ -15,7 +14,7 @@ public enum MatchType { return MatchType.valueOf(value); } catch (IllegalArgumentException e) { - throw new PaceException("The match type is not valid"); + return MatchType.UNDEFINED; } } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 02a34b2..19105ae 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -70,36 +70,40 @@ public class BlockProcessor { final String idCurr = curr.getIdentifier(); //check if pivot and current element are similar by processing the tree - if (navigateTree(pivot, curr)) + if (navigateTree(pivot, curr)!=MatchType.NO_MATCH) writeSimilarity(context, idPivot, idCurr); } - - } } - private 
boolean navigateTree(final MapDocument doc1, final MapDocument doc2){ + private MatchType navigateTree(final MapDocument doc1, final MapDocument doc2){ final Map decisionTree = dedupConf.getPace().getDecisionTree(); String current = "start"; - while (!current.equals(MatchType.NO_MATCH.toString()) && !current.equals(MatchType.ORCID_MATCH.toString()) && !current.equals(MatchType.TOPICS_MATCH.toString()) && !current.equals(MatchType.COAUTHORS_MATCH.toString())) { + while (MatchType.getEnum(current)==MatchType.UNDEFINED) { TreeNodeDef currentNode = decisionTree.get(current); //throw an exception if the node doesn't exist if (currentNode == null) throw new PaceException("The Tree Node doesn't exist: " + current); - int compare = currentNode.treeNode().compare(doc1.getFieldMap().get(currentNode.getField()), doc2.getFieldMap().get(currentNode.getField())); + double similarity = currentNode.evaluate(doc1, doc2); + + if (similarity == -1) { + current = currentNode.getUndefined(); + } + else if (similarity>=currentNode.getThreshold()){ + current = currentNode.getPositive(); + } + else { + current = currentNode.getNegative(); + } - current = (compare==0)?currentNode.getUndefined():(compare==-1)?currentNode.getNegative():currentNode.getPositive(); } - if (!current.equals(MatchType.NO_MATCH.toString())) - return true; - else - return false; + return MatchType.getEnum(current); } private Queue prepare(final Iterable documents) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index 00ca626..d827654 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -7,8 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass; import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.FieldDef; -import 
eu.dnetlib.pace.tree.TreeNode; -import eu.dnetlib.pace.tree.TreeNodeClass; +import eu.dnetlib.pace.tree.Comparator; +import eu.dnetlib.pace.tree.ComparatorClass; import org.reflections.Reflections; import java.io.Serializable; @@ -22,7 +22,7 @@ public class PaceResolver implements Serializable { private final Map> clusteringFunctions; private final Map> conditionAlgos; private final Map> distanceAlgos; - private final Map> treeNodes; + private final Map> comparators; public PaceResolver() { @@ -38,9 +38,9 @@ public class PaceResolver implements Serializable { .filter(DistanceAlgo.class::isAssignableFrom) .collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class)cl)); - this.treeNodes = new Reflections("eu.dnetlib").getTypesAnnotatedWith(TreeNodeClass.class).stream() - .filter(TreeNode.class::isAssignableFrom) - .collect(Collectors.toMap(cl -> cl.getAnnotation(TreeNodeClass.class).value(), cl -> (Class) cl)); + this.comparators = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ComparatorClass.class).stream() + .filter(Comparator.class::isAssignableFrom) + .collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class) cl)); } public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { @@ -67,9 +67,9 @@ public class PaceResolver implements Serializable { } } - public TreeNode getTreeNode(String name, Map params) throws PaceException { + public Comparator getComparator(String name, Map params) throws PaceException { try { - return treeNodes.get(name).getDeclaredConstructor(Map.class).newInstance(params); + return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { throw new PaceException(name + " not found ", e); } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java 
b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 4554063..883dde5 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -56,9 +56,10 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName2() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("University of Pisa", "Universita degli studi di Pisa"); + double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York"); assertEquals(result, 1.0); } + }