From 8c0d346005c87c231c7982f1be095d5c5c1958a3 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 21 Nov 2019 09:37:56 +0100 Subject: [PATCH] the param map has been updated: now it accepts string parameters --- .../eu/dnetlib/pace/tree/AlwaysMatch.java | 2 +- .../java/eu/dnetlib/pace/tree/CityMatch.java | 10 +- .../eu/dnetlib/pace/tree/ContainsMatch.java | 57 +++++++ .../eu/dnetlib/pace/tree/DoiExactMatch.java | 3 +- .../dnetlib/pace/tree/DomainExactMatch.java | 2 +- .../java/eu/dnetlib/pace/tree/ExactMatch.java | 2 +- .../pace/tree/ExactMatchIgnoreCase.java | 2 +- .../eu/dnetlib/pace/tree/JaroWinkler.java | 2 +- .../pace/tree/JaroWinklerNormalizedName.java | 12 +- .../dnetlib/pace/tree/JaroWinklerTitle.java | 2 +- .../eu/dnetlib/pace/tree/KeywordMatch.java | 10 +- .../dnetlib/pace/tree/Level2JaroWinkler.java | 2 +- .../pace/tree/Level2JaroWinklerTitle.java | 2 +- .../dnetlib/pace/tree/Level2Levenstein.java | 2 +- .../java/eu/dnetlib/pace/tree/Levenstein.java | 2 +- .../eu/dnetlib/pace/tree/LevensteinTitle.java | 2 +- .../tree/LevensteinTitleIgnoreVersion.java | 2 +- .../eu/dnetlib/pace/tree/MustBeDifferent.java | 2 +- .../dnetlib/pace/tree/NullDistanceAlgo.java | 2 +- .../java/eu/dnetlib/pace/tree/PidMatch.java | 6 +- .../java/eu/dnetlib/pace/tree/SizeMatch.java | 2 +- .../dnetlib/pace/tree/SortedJaroWinkler.java | 2 +- .../pace/tree/SortedLevel2JaroWinkler.java | 2 +- .../pace/tree/SubStringLevenstein.java | 4 +- .../dnetlib/pace/tree/TitleVersionMatch.java | 2 +- .../java/eu/dnetlib/pace/tree/UrlMatcher.java | 12 +- .../java/eu/dnetlib/pace/tree/YearMatch.java | 2 +- .../pace/tree/support/AbstractComparator.java | 6 +- .../support/AbstractSortedComparator.java | 4 +- .../dnetlib/pace/tree/support/FieldConf.java | 8 +- .../pace/tree/support/TreeNodeDef.java | 2 +- .../pace/tree/support/TreeProcessor.java | 1 - .../eu/dnetlib/pace/util/PaceResolver.java | 2 +- .../pace/comparators/ComparatorTest.java | 151 ++++++------------ .../eu/dnetlib/pace/config/ConfigTest.java | 1 - .../java/eu/dnetlib/pace/util/UtilTest.java | 4 +- 36 files changed, 169 insertions(+), 162 deletions(-) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ContainsMatch.java diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java index d5a33ea87..2fb8eb97c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java @@ -11,7 +11,7 @@ import java.util.Map; @ComparatorClass("alwaysMatch") public class AlwaysMatch extends AbstractComparator { - public AlwaysMatch(final Map params){ + public AlwaysMatch(final Map params){ super(params, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java index e56066814..998a52650 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java @@ -10,9 +10,9 @@ import java.util.Set; @ComparatorClass("cityMatch") public class CityMatch extends AbstractComparator { - private Map params; + private Map params; - public CityMatch(Map params) { + public CityMatch(Map params) { super(params); this.params = params; } @@ -29,8 +29,8 @@ public class CityMatch extends AbstractComparator { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); - Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); - Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); + Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set codes1 = citiesToCodes(cities1); Set codes2 = citiesToCodes(cities2); @@ -41,7 +41,7 @@ public class CityMatch extends AbstractComparator { else { if (codes1.isEmpty() ^ codes2.isEmpty()) return -1; //undefined if one of the two has no cities - return commonElementsPercentage(codes1, codes2) > params.getOrDefault("threshold", 0).intValue() ? 1.0 : 0.0; + return commonElementsPercentage(codes1, codes2) > Integer.parseInt(params.getOrDefault("threshold", "0")) ? 1.0 : 0.0; } } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ContainsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ContainsMatch.java new file mode 100644 index 000000000..8b8a342cd --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ContainsMatch.java @@ -0,0 +1,57 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +/** + * The Class Contains match + * + * @author miconis + * */ +@ComparatorClass("containsMatch") +public class ContainsMatch extends AbstractComparator { + + private Map params; + + public ContainsMatch(Map params) { + super(params); + this.params = params; + } + + @Override + public double distance(final String a, final String b, final Config conf) { + + //read parameters + boolean caseSensitive = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); + String string = params.get("string"); + String agg = params.get("bool"); + + String ca = a; + String cb = b; + if (!caseSensitive) { + ca = a.toLowerCase(); + cb = b.toLowerCase(); + } + + switch(agg) { + case "AND": + if(ca.contains(string) && cb.contains(string)) + return 1.0; + break; + case "OR": + if(ca.contains(string) || cb.contains(string)) + return 1.0; + break; + case "XOR": + if(ca.contains(string) ^ cb.contains(string)) + return 1.0; + break; + default: + return 0.0; + } + return 0.0; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java index d3ada9a47..24a94c8cd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java @@ -1,6 +1,5 @@ package eu.dnetlib.pace.tree; -import java.util.List; import java.util.Map; import eu.dnetlib.pace.model.Field; @@ -16,7 +15,7 @@ public class DoiExactMatch extends ExactMatchIgnoreCase { public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; - public DoiExactMatch(final Map params) { + public DoiExactMatch(final Map params) { super(params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java index 4132e1eba..e2eb0cd71 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java @@ -10,7 +10,7 @@ import java.util.Map; @ComparatorClass("domainExactMatch") public class DomainExactMatch extends ExactMatchIgnoreCase { - public DomainExactMatch(final Map params) { + public DomainExactMatch(final Map params) { super(params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java index 11f628d99..f8b7b7489 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -10,7 +10,7 @@ import java.util.Map; @ComparatorClass("exactMatch") public class ExactMatch extends AbstractComparator { - public ExactMatch(Map params){ + public ExactMatch(Map params){ super(params, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java index 4cfe048ca..89cd2719a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java @@ -10,7 +10,7 @@ import java.util.Map; @ComparatorClass("exactMatchIgnoreCase") public class ExactMatchIgnoreCase extends AbstractComparator { - public ExactMatchIgnoreCase(Map params) { + public ExactMatchIgnoreCase(Map params) { super(params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java index 0af0a8053..9e214f6a4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java @@ -11,7 +11,7 @@ import java.util.Map; @ComparatorClass("jaroWinkler") public class JaroWinkler extends AbstractComparator { - public JaroWinkler(Map params){ + public JaroWinkler(Map params){ super(params, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index 230bce8aa..b89cffaed 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -13,9 +13,9 @@ import java.util.Set; @ComparatorClass("jaroWinklerNormalizedName") public class JaroWinklerNormalizedName extends AbstractComparator { - private Map params; + private Map params; - public JaroWinklerNormalizedName(Map params){ + public JaroWinklerNormalizedName(Map params){ super(params, new com.wcohen.ss.JaroWinkler()); this.params = params; } @@ -39,11 +39,11 @@ public class JaroWinklerNormalizedName extends AbstractComparator { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); - Set keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); - Set keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); + Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); - Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); + Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); ca = removeKeywords(ca, keywords1); ca = removeKeywords(ca, cities1); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java index 8556eae4a..38ed437de 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java @@ -12,7 +12,7 @@ import java.util.Map; @ComparatorClass("jaroWinklerTitle") public class JaroWinklerTitle extends AbstractComparator { - public JaroWinklerTitle(Map params){ + public JaroWinklerTitle(Map params){ super(params, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java index ee51acc9b..476c39008 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java @@ -10,9 +10,9 @@ import java.util.Set; @ComparatorClass("keywordMatch") public class KeywordMatch extends AbstractComparator { - Map params; + Map params; - public KeywordMatch(Map params) { + public KeywordMatch(Map params) { super(params); this.params = params; } @@ -29,8 +29,8 @@ public class KeywordMatch extends AbstractComparator { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); - Set keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); - Set keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); + Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set codes1 = toCodes(keywords1, conf.translationMap()); Set codes2 = toCodes(keywords2, conf.translationMap()); @@ -41,7 +41,7 @@ public class KeywordMatch extends AbstractComparator { else { if (codes1.isEmpty() ^ codes2.isEmpty()) return -1; //undefined if one of the two has no keywords - return commonElementsPercentage(codes1, codes2) > params.getOrDefault("threshold", 0).intValue() ? 1.0 : 0.0; + return commonElementsPercentage(codes1, codes2) > Double.parseDouble(params.getOrDefault("threshold", "0.0")) ? 1.0 : 0.0; } } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java index 7d6e33fc2..f9e01356d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java @@ -9,7 +9,7 @@ import java.util.Map; @ComparatorClass("level2JaroWinkler") public class Level2JaroWinkler extends AbstractComparator { - public Level2JaroWinkler(Map params){ + public Level2JaroWinkler(Map params){ super(params, new com.wcohen.ss.Level2JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java index 0a9ffdf16..29f99e4ae 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java @@ -10,7 +10,7 @@ import java.util.Map; @ComparatorClass("level2JaroWinklerTitle") public class Level2JaroWinklerTitle extends AbstractComparator { - public Level2JaroWinklerTitle(Map params){ + public Level2JaroWinklerTitle(Map params){ super(params, new com.wcohen.ss.Level2JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java index 9d80d7028..73fb13c13 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java @@ -9,7 +9,7 @@ import java.util.Map; @ComparatorClass("level2Levenstein") public class Level2Levenstein extends AbstractComparator { - public Level2Levenstein(Map params){ + public Level2Levenstein(Map params){ super(params, new com.wcohen.ss.Level2Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java index d8706c911..c146e5ab1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java @@ -9,7 +9,7 @@ import java.util.Map; @ComparatorClass("levenstein") public class Levenstein extends AbstractComparator { - public Levenstein(Map params){ + public Levenstein(Map params){ super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java index e5edacfee..fda5848df 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -15,7 +15,7 @@ public class LevensteinTitle extends AbstractComparator { private static final Log log = LogFactory.getLog(LevensteinTitle.class); - public LevensteinTitle(Map params){ + public LevensteinTitle(Map params){ super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java index 89f3749b5..506760fa0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java @@ -14,7 +14,7 @@ import java.util.Map; @ComparatorClass("levensteinTitleIgnoreVersion") public class LevensteinTitleIgnoreVersion extends AbstractComparator { - public LevensteinTitleIgnoreVersion(Map params){ + public LevensteinTitleIgnoreVersion(Map params){ super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java index 01d77b30c..64413dfff 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java @@ -10,7 +10,7 @@ import java.util.Map; @ComparatorClass("mustBeDifferent") public class MustBeDifferent extends AbstractComparator { - public MustBeDifferent(Map params){ + public MustBeDifferent(Map params){ super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java index e65ac71f7..98c2f4548 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java @@ -14,7 +14,7 @@ import java.util.Map; @ComparatorClass("null") public class NullDistanceAlgo implements Comparator { - public NullDistanceAlgo(Map params){ + public NullDistanceAlgo(Map params){ } @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java index 14845daf4..0632e8bf9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java @@ -20,9 +20,9 @@ import java.util.stream.Collectors; public class PidMatch extends AbstractComparator { private static final Log log = LogFactory.getLog(PidMatch.class); - private Map params; + private Map params; - public PidMatch(final Map params) { + public PidMatch(final Map params) { super(params); this.params = params; } @@ -50,7 +50,7 @@ public class PidMatch extends AbstractComparator { return 0.0; } - return (double)incommon / (incommon + simDiff) > params.getOrDefault("threshold", 0.5).doubleValue() ? 1 : 0; + return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java index 91f1e356a..175b0666d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java @@ -24,7 +24,7 @@ public class SizeMatch extends AbstractComparator { * @param params * the parameters */ - public SizeMatch(final Map params) { + public SizeMatch(final Map params) { super(params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java index 4b2eb83f1..79173ba66 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java @@ -12,7 +12,7 @@ import java.util.Map; @ComparatorClass("sortedJaroWinkler") public class SortedJaroWinkler extends AbstractSortedComparator { - public SortedJaroWinkler(Map params){ + public SortedJaroWinkler(Map params){ super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java index f80a268e2..de8c669d7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java @@ -22,7 +22,7 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator { super(weight, new com.wcohen.ss.Level2JaroWinkler()); } - public SortedLevel2JaroWinkler(final Map params){ + public SortedLevel2JaroWinkler(final Map params){ super(params, new com.wcohen.ss.Level2JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java index 004fc90e2..f76947930 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java @@ -31,9 +31,9 @@ public class SubStringLevenstein extends AbstractComparator { super(w, new com.wcohen.ss.Levenstein()); } - public SubStringLevenstein(Map params){ + public SubStringLevenstein(Map params){ super(params, new com.wcohen.ss.Levenstein()); - this.limit = params.get("limit").intValue(); + this.limit = Integer.parseInt(params.getOrDefault("limit", "1")); } /** diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java index fd86b1730..873a0c100 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java @@ -17,7 +17,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("titleVersionMatch") public class TitleVersionMatch extends AbstractComparator { - public TitleVersionMatch(final Map params) { + public TitleVersionMatch(final Map params) { super(params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java index 8f36126c8..34bbab7bf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java @@ -12,19 +12,19 @@ import java.util.Map; @ComparatorClass("urlMatcher") public class UrlMatcher extends Levenstein { - private Map params; + private Map params; - public UrlMatcher(Map params){ + public UrlMatcher(Map params){ super(params); this.params = params; } - public UrlMatcher(double weight, Map params) { + public UrlMatcher(double weight, Map params) { super(weight); this.params = params; } - public void setParams(Map params) { + public void setParams(Map params) { this.params = params; } @@ -37,8 +37,8 @@ public class UrlMatcher extends Levenstein { return 0.0; } - Double hostW = params.get("host").doubleValue(); - Double pathW = params.get("path").doubleValue(); + Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5")); + Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5")); if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { return hostW * 0.5; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java index f57cba531..64bd75b0c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java @@ -18,7 +18,7 @@ public class YearMatch extends AbstractComparator { private int limit = 4; - public YearMatch(final Map params) { + public YearMatch(final Map params) { super(params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java index 1ed14b067..ddfcc5565 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java @@ -18,13 +18,13 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement /** The weight. */ protected double weight = 0.0; - private Map params; + private Map params; - protected AbstractComparator(Map params) { + protected AbstractComparator(Map params) { this.params = params; } - protected AbstractComparator(Map params, final AbstractStringDistance ssalgo){ + protected AbstractComparator(Map params, final AbstractStringDistance ssalgo){ this.params = params; this.weight = 1.0; this.ssalgo = ssalgo; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java index 18b5e67c6..557ad0c29 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java @@ -23,8 +23,8 @@ public abstract class AbstractSortedComparator extends AbstractComparator { super(weight, ssalgo); } - protected AbstractSortedComparator(final Map params, final AbstractStringDistance ssalgo){ - super(params.get("weight").doubleValue(), ssalgo); + protected AbstractSortedComparator(final Map params, final AbstractStringDistance ssalgo){ + super(Double.parseDouble(params.get("weight")), ssalgo); } @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java index b25d2a03c..0d08fdd03 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java @@ -12,7 +12,7 @@ public class FieldConf implements Serializable { private String field; //name of the field on which apply the comparator private String comparator; //comparator name private double weight = 1.0; //weight for the field (to be used in the aggregation) - private Map params; //parameters + private Map params; //parameters private boolean countIfUndefined; @@ -27,7 +27,7 @@ public class FieldConf implements Serializable { public FieldConf() { } - public FieldConf(String field, String comparator, double weight, Map params, boolean countIfUndefined) { + public FieldConf(String field, String comparator, double weight, Map params, boolean countIfUndefined) { this.field = field; this.comparator = comparator; this.weight = weight; @@ -59,11 +59,11 @@ public class FieldConf implements Serializable { this.weight = weight; } - public Map getParams() { + public Map getParams() { return params; } - public void setParams(Map params) { + public void setParams(Map params) { this.params = params; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 3f9bdce1e..f3d37c724 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -46,7 +46,7 @@ public class TreeNodeDef implements Serializable { double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); - stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField(), new FieldStats(weight, result, fieldConf.isCountIfUndefined())); + stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, result, fieldConf.isCountIfUndefined())); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java index 23c9a3ea8..b9af3594a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java @@ -23,7 +23,6 @@ public class TreeProcessor{ } public boolean compare(final MapDocument a, final MapDocument b) { - //evaluate the decision tree return evaluateTree(a, b).getResult() == MatchType.MATCH; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index 8c670a85c..bf6feea1c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -38,7 +38,7 @@ public class PaceResolver implements Serializable { } } - public Comparator getComparator(String name, Map params) throws PaceException { + public Comparator getComparator(String name, Map params) throws PaceException { try { return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index b7722a1ed..920d00eeb 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -2,9 +2,12 @@ package eu.dnetlib.pace.comparators; import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.tree.CityMatch; +import eu.dnetlib.pace.tree.ContainsMatch; import eu.dnetlib.pace.tree.JaroWinklerNormalizedName; import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.tree.KeywordMatch; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import eu.dnetlib.pace.common.AbstractPaceFunctions; @@ -17,13 +20,13 @@ import static junit.framework.Assert.assertTrue; public class ComparatorTest extends AbstractPaceFunctions { - private Map params; + private Map params; private DedupConfig conf; @Before public void setup() { params = new HashMap<>(); - params.put("weight", 1.0); + params.put("weight", "1.0"); conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ComparatorTest.class)); } @@ -34,103 +37,6 @@ public class ComparatorTest extends AbstractPaceFunctions { System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa")); } - @Test - public void testJaroWinklerNormalizedName() { - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf); - - System.out.println("result = " + result); - assertEquals(0.0, result); - } - - @Test - public void testJaroWinklerNormalizedName2() { - - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf); - - assertEquals(1.0, result); - } - - @Test - public void testJaroWinklerNormalizedName3() { - - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf); - - System.out.println("result = " + result); - assertEquals(0.0, result); - } - - @Test - public void testJaroWinklerNormalizedName4() { - - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf); - - System.out.println("result = " + result); - assertEquals(1.0, result); - } - - @Test - public void testJaroWinklerNormalizedName5() { - - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf); - - System.out.println("result = " + result); - assertEquals(1.0, result); - } - - @Test - public void testJaroWinklerNormalizedName6() { - - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf); - - System.out.println("result = " + result); - assertTrue(result > 0.9); - - } - - @Test - public void testJaroWinklerNormalizedName7() { - - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf); - - System.out.println("result = " + result); - assertTrue(result > 0.9); - } - - @Test - public void testJaroWinklerNormalizedName8() { - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - - double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf); - - System.out.println("result = " + result); - } - - @Test - public void testJaroWinklerNormalizedName9() { - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - - double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf); - - System.out.println("result = " + result); - } - - @Test - public void testJaroWinklerNormalizedName10(){ - - final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - - double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf); - - System.out.println("result = " + result); - } - @Test public void cityMatchTest() { final CityMatch cityMatch = new CityMatch(params); @@ -147,6 +53,53 @@ public class ComparatorTest extends AbstractPaceFunctions { //both names with cities (different) assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf)); + //particular cases + assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf)); + assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf)); + } + + // @Test +// public void testJaroWinklerNormalizedName6() { +// +// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); +// double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf); +// +// System.out.println("result = " + result); +// assertTrue(result > 0.9); +// +// } +// @Test +// public void testJaroWinklerNormalizedName10(){ +// +// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); +// +// double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf); +// +// System.out.println("result = " + result); +// } + + @Test + public void keywordMatchTest(){ + params.put("threshold", "0.4"); + + final KeywordMatch keywordMatch = new KeywordMatch(params); + + assertEquals(1.0, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); + assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf)); + assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf)); + assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); + } + + @Test + public void containsMatchTest(){ + + params.put("string", "openorgs"); + params.put("bool", "XOR"); + params.put("caseSensitive", "false"); + + final ContainsMatch containsMatch = new ContainsMatch(params); + + assertEquals(0.0, containsMatch.distance("openorgs", "openorgs", conf)); } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 33ef542da..3b87cedad 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -7,7 +7,6 @@ import java.util.Map; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; public class ConfigTest extends AbstractPaceTest { diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index e94afff9f..36aca3346 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -8,11 +8,11 @@ import java.util.Map; public class UtilTest { - Map params; + Map params; @Before public void setUp(){ - params = new HashMap(); + params = new HashMap(); } @Test