From ddd40540aa7f9e09faa5f5e45f03b5f499b8de04 Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 20 Nov 2019 10:45:00 +0100 Subject: [PATCH] jarowinklernormalizedname split into 3 different comparators: citymatch, keywordmatch and jarowinkler. Implementation of the TreeStatistic support functions --- .../clustering/RandomClusteringFunction.java | 1 - .../pace/common/AbstractPaceFunctions.java | 13 ++- .../java/eu/dnetlib/pace/tree/CityMatch.java | 47 ++++++++ .../pace/tree/JaroWinklerNormalizedName.java | 41 ++----- .../eu/dnetlib/pace/tree/KeywordMatch.java | 47 ++++++++ .../dnetlib/pace/tree/support/FieldStats.java | 54 +++++++++ .../pace/tree/support/TreeNodeDef.java | 12 +- .../pace/tree/support/TreeNodeStats.java | 106 ++++++++++-------- .../pace/tree/support/TreeProcessor.java | 17 ++- .../dnetlib/pace/tree/support/TreeStats.java | 51 +++++++++ .../pace/comparators/ComparatorTest.java | 21 ++++ .../java/eu/dnetlib/pace/util/UtilTest.java | 23 ++++ 12 files changed, 334 insertions(+), 99 deletions(-) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java index c485fcb9a..86a2e4e4f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -13,7 +13,6 @@ public class RandomClusteringFunction extends AbstractClusteringFunction { @Override protected Collection 
doApply(final Config conf, String s) { - // TODO Auto-generated method stub return null; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 8aeb36a0f..71954a394 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -216,14 +216,19 @@ public abstract class AbstractPaceFunctions { Set k1 = keywordsToCodes(s1, translationMap); Set k2 = keywordsToCodes(s2, translationMap); - int longer = (k1.size()>k2.size())?k1.size():k2.size(); - if (k1.isEmpty() || k2.isEmpty()) return 1.0; - else - return (double)CollectionUtils.intersection(k1,k2).size()/(double)longer; + + return commonElementsPercentage(k1, k2); } + public double commonElementsPercentage(Set s1, Set s2){ + + int longer = (s1.size()>s2.size())?s1.size():s2.size(); + + return (double)CollectionUtils.intersection(s1,s2).size()/(double)longer; + } + //convert the set of keywords to codes public Set toCodes(Set keywords, Map translationMap) { return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java new file mode 100644 index 000000000..e56066814 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java @@ -0,0 +1,47 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; +import java.util.Set; + +@ComparatorClass("cityMatch") +public class CityMatch extends AbstractComparator { + + private Map params; + + public CityMatch(Map params) { + super(params); + this.params = params; + } + + @Override + public double 
distance(final String a, final String b, final Config conf) { + + String ca = cleanup(a); + String cb = cleanup(b); + + ca = normalize(ca); + cb = normalize(cb); + + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); + + Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); + Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); + + Set codes1 = citiesToCodes(cities1); + Set codes2 = citiesToCodes(cities2); + + //if no cities are detected, the comparator gives 1.0 + if (codes1.isEmpty() && codes2.isEmpty()) + return 1.0; + else { + if (codes1.isEmpty() ^ codes2.isEmpty()) + return -1; //undefined if one of the two has no cities + return commonElementsPercentage(codes1, codes2) > params.getOrDefault("threshold", 0).intValue() ? 1.0 : 0.0; + } + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index 6899be109..230bce8aa 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -45,40 +45,15 @@ public class JaroWinklerNormalizedName extends AbstractComparator { Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); - if (checkCities(cities1,cities2)) { + ca = removeKeywords(ca, keywords1); + ca = removeKeywords(ca, cities1); + cb = removeKeywords(cb, keywords2); + cb = removeKeywords(cb, cities2); - if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) { - - ca = removeKeywords(ca, keywords1); - ca = removeKeywords(ca, cities1); - cb = removeKeywords(cb, keywords2); - cb = removeKeywords(cb, cities2); - - if (ca.isEmpty() && cb.isEmpty()) - return 1.0; - else - return 
normalize(ssalgo.score(ca,cb)); - - } - } - - return 0.0; - } - - //returns true if at least 1 city is in common - //returns true if no cities are contained in names - //returns false if one of the two names have no city - public boolean checkCities(Set s1, Set s2){ - Set c1 = citiesToCodes(s1); - Set c2 = citiesToCodes(s2); - - if (c1.isEmpty() && c2.isEmpty()) - return true; - else { - if (c1.isEmpty() ^ c2.isEmpty()) - return false; - return CollectionUtils.intersection(c1, c2).size() > 0; - } + if (ca.isEmpty() && cb.isEmpty()) + return 1.0; + else + return normalize(ssalgo.score(ca,cb)); } @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java new file mode 100644 index 000000000..ee51acc9b --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java @@ -0,0 +1,47 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; +import java.util.Set; + +@ComparatorClass("keywordMatch") +public class KeywordMatch extends AbstractComparator { + + Map params; + + public KeywordMatch(Map params) { + super(params); + this.params = params; + } + + @Override + public double distance(final String a, final String b, final Config conf) { + + String ca = cleanup(a); + String cb = cleanup(b); + + ca = normalize(ca); + cb = normalize(cb); + + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); + + Set keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); + Set keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); + + Set codes1 = toCodes(keywords1, conf.translationMap()); + Set codes2 = toCodes(keywords2, conf.translationMap()); + + //if no keywords are detected, the comparator gives 1.0 + if 
(codes1.isEmpty() && codes2.isEmpty()) + return 1.0; + else { + if (codes1.isEmpty() ^ codes2.isEmpty()) + return -1; //undefined if one of the two has no keywords + return commonElementsPercentage(codes1, codes2) > params.getOrDefault("threshold", 0).intValue() ? 1.0 : 0.0; + } + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java new file mode 100644 index 000000000..2f1decd32 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java @@ -0,0 +1,54 @@ +package eu.dnetlib.pace.tree.support; + +import eu.dnetlib.pace.util.PaceException; +import org.codehaus.jackson.map.ObjectMapper; + +import java.io.IOException; +import java.io.Serializable; + +public class FieldStats implements Serializable { + + private double weight; //weight for the field (to be used in the aggregation) + private double result; //the result of the comparison + + private boolean countIfUndefined; + + public FieldStats(double weight, double result, boolean countIfUndefined) { + this.weight = weight; + this.result = result; + this.countIfUndefined = countIfUndefined; + } + + public double getWeight() { + return weight; + } + + public void setWeight(double weight) { + this.weight = weight; + } + + public double getResult() { + return result; + } + + public void setResult(double result) { + this.result = result; + } + + public boolean isCountIfUndefined() { + return countIfUndefined; + } + + public void setCountIfUndefined(boolean countIfUndefined) { + this.countIfUndefined = countIfUndefined; + } + + @Override + public String toString(){ + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 4f95ad366..3f9bdce1e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -39,7 +39,6 @@ public class TreeNodeDef implements Serializable { public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) { TreeNodeStats stats = new TreeNodeStats(); - stats.setFieldsCount(fields.size()); for (FieldConf fieldConf : fields) { @@ -47,16 +46,7 @@ public class TreeNodeDef implements Serializable { double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); - if (result == -1) { //if the comparison is undefined - stats.incrementUndefinedCount(); - if (fieldConf.isCountIfUndefined()) { //if it must be taken into account, increment weights (i.e. the average would be lower) - stats.incrementWeightsSum(weight); - } - } - else { //if the field is not missing - stats.incrementScoresSum(weight * result); - stats.incrementWeightsSum(weight); - } + stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField(), new FieldStats(weight, result, fieldConf.isCountIfUndefined())); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java index 8d313e8eb..c5b1d0fcf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java @@ -1,90 +1,108 @@ package eu.dnetlib.pace.tree.support; -import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; - import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; public class TreeNodeStats implements Serializable { - private DescriptiveStatistics stats; - private int undefinedCount = 0; 
//counter for the number of undefined comparisons between the fields in the tree node - private int fieldsCount = 0; - private double weightsSum = 0.0; + private Map results; //this is an accumulator for the results of the node public TreeNodeStats(){ - this.stats = new DescriptiveStatistics(); + this.results = new HashMap<>(); } - public TreeNodeStats(int undefinedCount, int fieldsCount, double weightsSum) { - this.undefinedCount = undefinedCount; - this.fieldsCount = fieldsCount; - this.weightsSum = weightsSum; + public Map getResults() { + return results; } - public DescriptiveStatistics getStats() { - return stats; + public void addFieldStats(String id, FieldStats fieldStats){ + this.results.put(id, fieldStats); } - public void setStats(DescriptiveStatistics stats) { - this.stats = stats; + public int fieldsCount(){ + return this.results.size(); } - public int getUndefinedCount() { + public int undefinedCount(){ + int undefinedCount = 0; + for(FieldStats fs: this.results.values()){ + if(fs.getResult() == -1) + undefinedCount ++; + } return undefinedCount; } - public void setUndefinedCount(int undefinedCount) { - this.undefinedCount = undefinedCount; + public double scoreSum(){ + double scoreSum = 0.0; + for(FieldStats fs: this.results.values()){ + if(fs.getResult()>=0.0) { + scoreSum += fs.getResult(); + } + } + return scoreSum; } - public int getFieldsCount() { - return fieldsCount; + //return the sum of the weights without considering the fields with countIfUndefined=false && result=-1 + public double weightSum(){ + double weightSum = 0.0; + for(FieldStats fs: this.results.values()){ + if(fs.getResult()>=0.0 || (fs.getResult()<0.0 && fs.isCountIfUndefined())) { + weightSum += fs.getWeight(); + } + } + return weightSum; } - public void setFieldsCount(int fields) { - this.fieldsCount = fields; + public double weightedScoreSum(){ + double weightedScoreSum = 0.0; + for(FieldStats fs: this.results.values()){ + if(fs.getResult()>=0.0) { + weightedScoreSum += 
fs.getResult()*fs.getWeight(); + } + } + return weightedScoreSum; } - public double getWeightsSum() { - return weightsSum; + public double max(){ + double max = -1.0; + for(FieldStats fs: this.results.values()){ + if(fs.getResult()>max) + max = fs.getResult(); + } + return max; } - public void setWeightsSum(double weightsSum) { - this.weightsSum = weightsSum; - } - - public void incrementWeightsSum(double delta){ - this.weightsSum += delta; - } - - public void incrementUndefinedCount(){ - this.undefinedCount += 1; - } - - public void incrementScoresSum(double delta){ - this.stats.addValue(delta); + public double min(){ + double min = 100.0; //random high value + for(FieldStats fs: this.results.values()){ + if(fs.getResult()=0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined())) + min = fs.getResult(); + } + } + return min; } public double getFinalScore(AggType aggregation){ switch (aggregation){ case AVG: - return stats.getMean(); + return scoreSum()/fieldsCount(); case SUM: - return stats.getSum(); + return scoreSum(); case SC: case OR: case MAX: - return stats.getMax(); + return max(); case NC: case AND: case MIN: - return stats.getMin(); + return min(); case W_MEAN: - return stats.getSum()/weightsSum; + return weightedScoreSum()/weightSum(); default: return 0.0; } } - } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java index 5663a79dd..23c9a3ea8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java @@ -6,12 +6,13 @@ import eu.dnetlib.pace.util.PaceException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import java.io.Serializable; import java.util.Map; /** * The compare between two documents is given by the weighted mean of the field distances */ -public class TreeProcessor { +public 
class TreeProcessor{ private static final Log log = LogFactory.getLog(TreeProcessor.class); @@ -24,10 +25,12 @@ public class TreeProcessor { public boolean compare(final MapDocument a, final MapDocument b) { //evaluate the decision tree - return evaluateTree(a, b) == MatchType.MATCH; + return evaluateTree(a, b).getResult() == MatchType.MATCH; } - public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2){ + public TreeStats evaluateTree(final MapDocument doc1, final MapDocument doc2){ + + TreeStats treeStats = new TreeStats(); String current = "start"; @@ -39,9 +42,10 @@ public class TreeProcessor { throw new PaceException("The Tree Node doesn't exist: " + current); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); + treeStats.addNodeStats(current, stats); //if ignoreUndefined=false the miss is considered as undefined - if (!currentNode.isIgnoreUndefined() && stats.getUndefinedCount()>0) { + if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) { current = currentNode.getUndefined(); } //if ignoreUndefined=true the miss is ignored and the score computed anyway @@ -54,7 +58,8 @@ public class TreeProcessor { } - return MatchType.parse(current); + treeStats.setResult(MatchType.parse(current)); + return treeStats; } public double computeScore(final MapDocument doc1, final MapDocument doc2) { @@ -72,7 +77,7 @@ public class TreeProcessor { score = stats.getFinalScore(currentNode.getAggregation()); //if ignoreUndefined=false the miss is considered as undefined - if (!currentNode.isIgnoreUndefined() && stats.getUndefinedCount()>0) { + if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) { current = currentNode.getUndefined(); } //if ignoreUndefined=true the miss is ignored and the score computed anyway diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java new file mode 100644 index 000000000..36188e3c3 --- 
/dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java @@ -0,0 +1,51 @@ +package eu.dnetlib.pace.tree.support; + +import eu.dnetlib.pace.util.PaceException; +import org.codehaus.jackson.map.ObjectMapper; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class TreeStats { + + //layer id -> stats of the tree node evaluated at that layer + Map stats; + MatchType result; + + public TreeStats(){ + this.stats = new HashMap<>(); + this.result = MatchType.NO_MATCH; + } + + public MatchType getResult(){ + return this.result; + } + + public void setResult(MatchType result){ + this.result = result; + } + + public Map getStats() { + return stats; + } + + public void setStats(Map stats) { + this.stats = stats; + } + + public void addNodeStats(String layerID, TreeNodeStats treeNodeStats){ + this.stats.put(layerID, treeNodeStats); + } + + @Override + public String toString(){ + try { + return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } + + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 4247d9be6..b7722a1ed 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.comparators; import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.tree.CityMatch; import eu.dnetlib.pace.tree.JaroWinklerNormalizedName; import eu.dnetlib.pace.config.DedupConfig; import org.junit.Before; @@ -129,4 +130,24 @@ public class ComparatorTest extends AbstractPaceFunctions { System.out.println("result = " + result); } + + @Test + public void cityMatchTest() { + final CityMatch cityMatch = new CityMatch(params); + + //both names with no cities + assertEquals(1.0, 
cityMatch.distance("Università", "Centro di ricerca", conf)); + + //one of the two names with no cities + assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf)); + + //both names with cities (same) + assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf)); + + //both names with cities (different) + assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf)); + + } + + } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java new file mode 100644 index 000000000..e94afff9f --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -0,0 +1,23 @@ +package eu.dnetlib.pace.util; + +import org.junit.Before; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; + +public class UtilTest { + + Map params; + + @Before + public void setUp(){ + params = new HashMap(); + } + + @Test + public void paceResolverTest() { + PaceResolver paceResolver = new PaceResolver(); + paceResolver.getComparator("keywordMatch", params); + } +}