From a5c5d2f01bf403176d941f444c334fff36fcb84f Mon Sep 17 00:00:00 2001 From: miconis Date: Fri, 9 Aug 2019 10:08:34 +0200 Subject: [PATCH] implementation of the decision tree. It takes place of the distance algos, necessaryConditions and sufficientConditions are still there. The model contains only path, type and name of the field. ignoreMissing is still in the model because it is used by the conditions. --- .../pace/condition/AbstractCondition.java | 2 +- .../dnetlib/pace/condition/ConditionAlgo.java | 2 +- .../java/eu/dnetlib/pace/config/Config.java | 12 +- .../eu/dnetlib/pace/config/DedupConfig.java | 10 +- .../eu/dnetlib/pace/config/PaceConfig.java | 36 ++-- .../pace/distance/AbstractDistance.java | 30 ++-- .../eu/dnetlib/pace/distance/Distance.java | 3 +- .../dnetlib/pace/distance/DistanceAlgo.java | 2 +- .../dnetlib/pace/distance/DistanceScorer.java | 126 -------------- .../pace/distance/PaceDocumentDistance.java | 24 +-- .../pace/distance/PairwiseComparison.java | 125 ++++++++++++++ .../distance/SecondStringDistanceAlgo.java | 6 +- .../pace/distance/algo/LevensteinTitle.java | 1 - .../algo/LevensteinTitleIgnoreVersion.java | 2 +- .../pace/distance/algo/NullDistanceAlgo.java | 2 +- .../pace/distance/algo/SortedJaroWinkler.java | 4 +- .../algo/SortedLevel2JaroWinkler.java | 4 +- .../algo/SortedSecondStringDistanceAlgo.java | 6 +- .../distance/algo/SubStringLevenstein.java | 6 +- .../pace/distance/eval/DistanceEvalMap.java | 32 ---- .../pace/distance/eval/ScoreResult.java | 62 ------- .../java/eu/dnetlib/pace/model/FieldDef.java | 53 ++---- .../eu/dnetlib/pace/tree/AlwaysMatch.java | 42 +++++ .../java/eu/dnetlib/pace/tree/ExactMatch.java | 38 +++++ .../eu/dnetlib/pace/tree/JaroWinkler.java | 46 +++++ .../pace/tree/JaroWinklerNormalizedName.java | 78 +++++++++ .../dnetlib/pace/tree/JaroWinklerTitle.java | 46 +++++ .../dnetlib/pace/tree/Level2JaroWinkler.java | 36 ++++ .../pace/tree/Level2JaroWinklerTitle.java | 51 ++++++ 
.../dnetlib/pace/tree/Level2Levenstein.java | 36 ++++ .../java/eu/dnetlib/pace/tree/Levenstein.java | 36 ++++ .../eu/dnetlib/pace/tree/LevensteinTitle.java | 59 +++++++ .../tree/LevensteinTitleIgnoreVersion.java | 60 +++++++ .../eu/dnetlib/pace/tree/MustBeDifferent.java | 41 +++++ .../dnetlib/pace/tree/NullDistanceAlgo.java | 25 +++ .../dnetlib/pace/tree/SortedJaroWinkler.java | 63 +++++++ .../pace/tree/SortedLevel2JaroWinkler.java | 63 +++++++ .../pace/tree/SubStringLevenstein.java | 99 +++++++++++ .../java/eu/dnetlib/pace/tree/UrlMatcher.java | 60 +++++++ .../pace/tree/support/AbstractComparator.java | 110 ++++++++++++ .../support/AbstractSortedComparator.java | 38 +++++ .../eu/dnetlib/pace/tree/support/AggType.java | 22 +++ .../dnetlib/pace/tree/support/Comparator.java | 9 + .../pace/tree/support/ComparatorClass.java | 13 ++ .../{model => tree/support}/FieldConf.java | 15 +- .../dnetlib/pace/tree/support/MatchType.java | 18 ++ .../pace/tree/support/TreeNodeDef.java | 157 ++++++++++++++++++ .../eu/dnetlib/pace/util/BlockProcessor.java | 33 ++-- .../eu/dnetlib/pace/util/DiffPatchMatch.java | 4 +- .../eu/dnetlib/pace/util/PaceResolver.java | 18 +- .../eu/dnetlib/pace/config/dedupConfig.st | 4 +- .../eu/dnetlib/pace/config/org.curr.conf | 4 +- 52 files changed, 1517 insertions(+), 357 deletions(-) delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java create mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java rename dnet-pace-core/src/main/java/eu/dnetlib/pace/{model => tree/support}/FieldConf.java (81%) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java create mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java index 2b4aa29a1..5c7b4d130 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java @@ -9,7 +9,7 @@ import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; /** - * Abstract conditions needs a list of field names. + * Abstract necessaryConditions needs a list of field names. * * @author claudio * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java index 787ad9af1..1ea9caa16 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java @@ -6,7 +6,7 @@ import eu.dnetlib.pace.model.Document; import eu.dnetlib.pace.model.FieldDef; /** - * Allows to express general conditions to be satisfied or not between two Documents. + * Allows to express general necessaryConditions to be satisfied or not between two Documents. * * @author claudio */ diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 7498c23cf..1a12e699f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -6,6 +6,7 @@ import java.util.Map; import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.tree.support.TreeNodeDef; /** * Interface for PACE configuration bean. 
@@ -21,6 +22,9 @@ public interface Config { */ public List model(); + + public Map decisionTree(); + /** * Field configuration definitions. * @@ -31,16 +35,16 @@ public interface Config { /** * Strict Pre-Condition definitions. * - * @return the list of conditions + * @return the list of necessaryConditions */ - public List strictConditions(); + public List sufficientConditions(); /** * Pre-Condition definitions. * - * @return the list of conditions + * @return the list of necessaryConditions */ - public List conditions(); + public List necessaryConditions(); /** * Clusterings. diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index 1cfcb089c..805f5c38e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -8,6 +8,7 @@ import java.util.Map; import java.util.Map.Entry; import java.util.function.BiFunction; +import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceException; import org.antlr.stringtemplate.StringTemplate; import org.apache.commons.io.IOUtils; @@ -114,6 +115,11 @@ public class DedupConfig implements Config, Serializable { } } + @Override + public Map decisionTree(){ + return getPace().getDecisionTree(); + } + @Override public List model() { return getPace().getModel(); @@ -125,12 +131,12 @@ public class DedupConfig implements Config, Serializable { } @Override - public List strictConditions() { + public List sufficientConditions() { return getPace().getStrictConditionAlgos(); } @Override - public List conditions() { + public List necessaryConditions() { return getPace().getConditionAlgos(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 4fde1dee9..d90cfe381 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -6,6 +6,7 @@ import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceResolver; import org.apache.commons.collections.CollectionUtils; import org.codehaus.jackson.annotate.JsonIgnore; @@ -18,9 +19,12 @@ import java.util.stream.Collectors; public class PaceConfig implements Serializable { private List model; - private List strictConditions; - private List conditions; + + private List sufficientConditions; + private List necessaryConditions; private List clustering; + private Map decisionTree; + private Map> blacklists; @JsonIgnore @@ -46,30 +50,30 @@ public class PaceConfig implements Serializable { this.model = model; } - public List getStrictConditions() { - return strictConditions; + public List getSufficientConditions() { + return sufficientConditions; } - public void setStrictConditions(final List strictConditions) { - this.strictConditions = strictConditions; + public void setSufficientConditions(final List sufficientConditions) { + this.sufficientConditions = sufficientConditions; } - public List getConditions() { - return conditions; + public List getNecessaryConditions() { + return necessaryConditions; } @JsonIgnore public List getConditionAlgos() { - return asConditionAlgos(getConditions()); + return asConditionAlgos(getNecessaryConditions()); } @JsonIgnore public List getStrictConditionAlgos() { - return asConditionAlgos(getStrictConditions()); + return asConditionAlgos(getSufficientConditions()); } - public void setConditions(final List conditions) { - this.conditions = conditions; + public void setNecessaryConditions(final List necessaryConditions) { + this.necessaryConditions = necessaryConditions; } public List 
getClustering() { @@ -80,6 +84,14 @@ public class PaceConfig implements Serializable { this.clustering = clustering; } + public Map getDecisionTree() { + return decisionTree; + } + + public void setDecisionTree(Map decisionTree) { + this.decisionTree = decisionTree; + } + public Map> getBlacklists() { return blacklists; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java index f9d189ff6..3304f3638 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java @@ -1,15 +1,15 @@ -package eu.dnetlib.pace.distance; - -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.distance.eval.ScoreResult; -import eu.dnetlib.pace.model.Document; - -public abstract class AbstractDistance implements Distance { - - protected abstract Document toDocument(A a); - - @Override - public ScoreResult between(final A a, final A b, final Config config) { - return new DistanceScorer(config).distance(toDocument(a), toDocument(b)); - } -} +//package eu.dnetlib.pace.distance; +// +//import eu.dnetlib.pace.config.Config; +//import eu.dnetlib.pace.distance.eval.ScoreResult; +//import eu.dnetlib.pace.model.Document; +// +//public abstract class AbstractDistance implements Distance { +// +// protected abstract Document toDocument(A a); +// +// @Override +// public boolean between(final A a, final A b, final Config config) { +// return new PairwiseComparison(config).compare(toDocument(a), toDocument(b)); +// } +//} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java index 93a6e757a..b812bd1d9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java @@ -1,9 +1,8 @@ package 
eu.dnetlib.pace.distance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.distance.eval.ScoreResult; public interface Distance { - public ScoreResult between(A a, A b, Config config); + public boolean between(A a, A b, Config config); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java index 5e4f69f51..80b2191cc 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java @@ -5,7 +5,7 @@ import eu.dnetlib.pace.model.Field; import java.util.Map; /** - * Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two + * Each field is configured with a compare algo which knows how to compute the compare (0-1) between the fields of two * objects. */ public interface DistanceAlgo { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java deleted file mode 100644 index bb3c37ed6..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java +++ /dev/null @@ -1,126 +0,0 @@ -package eu.dnetlib.pace.distance; - -import eu.dnetlib.pace.condition.ConditionAlgo; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.distance.eval.ConditionEvalMap; -import eu.dnetlib.pace.distance.eval.DistanceEval; -import eu.dnetlib.pace.distance.eval.DistanceEvalMap; -import eu.dnetlib.pace.distance.eval.ScoreResult; -import eu.dnetlib.pace.model.*; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import java.util.Collection; -import java.util.List; -import java.util.stream.Collectors; - -/** - * The distance between two documents is given by the 
weighted mean of the field distances - */ -public class DistanceScorer { - - private static final Log log = LogFactory.getLog(DistanceScorer.class); - - private Config config; - - public DistanceScorer(final Config config) { - this.config = config; - } - - public ScoreResult distance(final Document a, final Document b) { - final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison - - sr.setStrictConditions(verify(a, b, config.strictConditions())); - sr.setConditions(verify(a, b, config.conditions())); - - final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model())); - - for (final FieldDef fd : config.model()) { - - dMap.updateDistance(fieldDistance(a, b, fd)); - } - sr.setDistances(dMap); - return sr; - } - - private ConditionEvalMap verify(final Document a, final Document b, final List conditions) { - final ConditionEvalMap res = new ConditionEvalMap(); - - for (final ConditionAlgo cd : conditions) { - final ConditionEvalMap map = cd.verify(a, b); - res.mergeFrom(map); - - // commented out shortcuts - /* - if (map.anyNegative()) { - return res; - } - */ - - //if (strict && (res < 0)) return -1; - //cond += verify; - } - return res; - } - - private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) { - - final double w = fd.getWeight(); - final Field va = getValue(a, fd); - final Field vb = getValue(b, fd); - - final DistanceEval de = new DistanceEval(fd, va, vb); - if ((w == 0)) return de; // optimization for 0 weight - else { - if (va.isEmpty() || vb.isEmpty()) { - if (fd.isIgnoreMissing()) { - de.setDistance(-1); - } else { - de.setDistance(w); - } - } else { - if (va.getType().equals(vb.getType())) { - de.setDistance(w * fd.distanceAlgo().distance(va, vb)); - } else { - throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType())); - } - } - return de; - } - } - - private Field getValue(final Document d, final FieldDef fd) { - final 
Field v = d.values(fd.getName()); - if (fd.getLength() > 0) { - - if (v instanceof FieldValueImpl) { - ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength())); - } else if (v instanceof FieldListImpl) { - List strings = ((FieldListImpl) v).stringList(); - strings = strings.stream() - .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) - .map(s -> StringUtils.substring(s, 0, fd.getLength())) - .collect(Collectors.toList()); - ((FieldListImpl) v).clear(); - ((FieldListImpl) v).addAll(strings.stream() - .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) - .map(s -> StringUtils.substring(s, 0, fd.getLength())) - .map(s -> new FieldValueImpl(v.getType(), v.getName(), s)) - .collect(Collectors.toList())); - } - } - - return v; - } - - private double sumWeights(final Collection fields) { - double sum = 0.0; - for (final FieldDef fd : fields) { - sum += fd.getWeight(); - } - return sum; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java index 7651479ee..9c75bfcf8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java @@ -1,12 +1,12 @@ -package eu.dnetlib.pace.distance; - -import eu.dnetlib.pace.model.Document; - -public class PaceDocumentDistance extends AbstractDistance { - - @Override - protected Document toDocument(Document a) { - return a; - } - -} +//package eu.dnetlib.pace.distance; +// +//import eu.dnetlib.pace.model.Document; +// +//public class PaceDocumentDistance extends AbstractDistance { +// +// @Override +// protected Document toDocument(Document a) { +// return a; +// } +// +//} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java new file mode 
100644 index 000000000..125919d0f --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java @@ -0,0 +1,125 @@ +package eu.dnetlib.pace.distance; + +import eu.dnetlib.pace.condition.ConditionAlgo; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.distance.eval.ConditionEvalMap; +import eu.dnetlib.pace.model.*; +import eu.dnetlib.pace.tree.support.MatchType; +import eu.dnetlib.pace.tree.support.TreeNodeDef; +import eu.dnetlib.pace.util.PaceException; +import eu.dnetlib.pace.util.Reporter; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.List; +import java.util.Map; + +/** + * The compare between two documents is given by the weighted mean of the field distances + */ +public class PairwiseComparison { + + private static final Log log = LogFactory.getLog(PairwiseComparison.class); + + private Config config; + + public PairwiseComparison(final Config config) { + this.config = config; + } + + public boolean compare(final MapDocument a, final MapDocument b) { + + //verify sufficientConditions + if (verify(a, b, config.sufficientConditions()).result() > 0) + return true; + + //verify necessaryConditions + if (verify(a, b, config.necessaryConditions()).result() < 0) + return false; + + //evaluate the decision tree + return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH; + } + + private ConditionEvalMap verify(final Document a, final Document b, final List conditions) { + final ConditionEvalMap res = new ConditionEvalMap(); + + for (final ConditionAlgo cd : conditions) { + final ConditionEvalMap map = cd.verify(a, b); + res.mergeFrom(map); + + // commented out shortcuts + /* + if (map.anyNegative()) { + return res; + } + */ + + //if (strict && (res < 0)) return -1; + //cond += verify; + } + return res; + } + + public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map decisionTree){ + + String current = "start"; + double 
similarity; + + while (MatchType.parse(current)==MatchType.UNDEFINED) { + + TreeNodeDef currentNode = decisionTree.get(current); + //throw an exception if the node doesn't exist + if (currentNode == null) + throw new PaceException("The Tree Node doesn't exist: " + current); + + similarity = currentNode.evaluate(doc1, doc2); + + if (similarity == -1) { + current = currentNode.getUndefined(); + } + else if (similarity>=currentNode.getThreshold()){ + current = currentNode.getPositive(); + } + else { + current = currentNode.getNegative(); + } + + } + + return MatchType.parse(current); + } + +// private Field getValue(final Document d, final FieldDef fd) { +// final Field v = d.values(fd.getName()); +// if (fd.getLength() > 0) { +// +// if (v instanceof FieldValueImpl) { +// ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength())); +// } else if (v instanceof FieldListImpl) { +// List strings = ((FieldListImpl) v).stringList(); +// strings = strings.stream() +// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) +// .map(s -> StringUtils.substring(s, 0, fd.getLength())) +// .collect(Collectors.toList()); +// ((FieldListImpl) v).clear(); +// ((FieldListImpl) v).addAll(strings.stream() +// .limit(fd.getSize() > 0 ? 
fd.getSize() : strings.size()) +// .map(s -> StringUtils.substring(s, 0, fd.getLength())) +// .map(s -> new FieldValueImpl(v.getType(), v.getName(), s)) +// .collect(Collectors.toList())); +// } +// } +// +// return v; +// } +// +// private double sumWeights(final Collection fields) { +// double sum = 0.0; +// for (final FieldDef fd : fields) { +// sum += fd.getWeight(); +// } +// return sum; +// } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java index 9cc35298f..d3ccccec5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java @@ -12,7 +12,7 @@ import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; /** - * For the rest of the fields delegate the distance measure to the second string library. + * For the rest of the fields delegate the compare measure to the second string library. */ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo { @@ -35,7 +35,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp } /** - * Instantiates a new second string distance algo. + * Instantiates a new second string compare algo. 
* * @param weight * the weight @@ -90,7 +90,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) + * @see eu.dnetlib.pace.compare.DistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) */ @Override public double distance(final Field a, final Field b) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java index 503dc33b2..ae0ef9d00 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.DistanceScorer; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java index ff8b34bf3..6303f8e2a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java @@ -7,7 +7,7 @@ import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import java.util.Map; /** - * Compared distance between two titles, ignoring version numbers. Suitable for Software entities. + * Compared compare between two titles, ignoring version numbers. Suitable for Software entities. 
*/ @DistanceClass("LevensteinTitleIgnoreVersion") public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java index 8afc45fd6..c1b1d7223 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java @@ -7,7 +7,7 @@ import eu.dnetlib.pace.model.Field; import java.util.Map; /** - * Not all fields of a document need to partecipate in the distance measure. We model those fields as having a + * Not all fields of a document need to partecipate in the compare measure. We model those fields as having a * NullDistanceAlgo. */ @DistanceClass("Null") diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java index e3175a13e..d3aa58f99 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java @@ -40,7 +40,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override public double getWeight() { @@ -50,7 +50,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override protected double normalize(final double d) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java index 1a12e1688..2523153ed 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java @@ -40,7 +40,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override public double getWeight() { @@ -50,7 +50,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override protected double normalize(final double d) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java index 8a9c51402..f72750521 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java @@ -12,12 +12,12 @@ import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; /** - * For the rest of the fields delegate the distance measure to the second string library. + * For the rest of the fields delegate the compare measure to the second string library. */ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo { /** - * Instantiates a new sorted second string distance algo. + * Instantiates a new sorted second string compare algo. 
* * @param weight * the weight @@ -35,7 +35,7 @@ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanc /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field) */ @Override protected List toList(final Field list) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java index 8f0c024c7..b788fade4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java @@ -66,7 +66,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) */ @Override public double distance(final Field a, final Field b) { @@ -79,7 +79,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override public double getWeight() { @@ -89,7 +89,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override protected double normalize(final double d) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java deleted file mode 100644 
index 764e60354..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java +++ /dev/null @@ -1,32 +0,0 @@ -package eu.dnetlib.pace.distance.eval; - -import java.util.HashMap; - -/** - * Created by claudio on 10/03/16. - */ -public class DistanceEvalMap extends HashMap { - - private double sumWeights; - - private double sumDistances = 0.0; - - public DistanceEvalMap(final double sumWeights) { - this.sumWeights = sumWeights; - } - - public void updateDistance(final DistanceEval d) { - - put(d.getFieldDef().getName(), d); - if (d.getDistance() >= 0) { - sumDistances += d.getDistance(); - } else { - sumWeights -= d.getFieldDef().getWeight(); - } - } - - public double distance() { - return sumWeights == 0 ? 0 : sumDistances / sumWeights; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java deleted file mode 100644 index 62b7d85b4..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java +++ /dev/null @@ -1,62 +0,0 @@ -package eu.dnetlib.pace.distance.eval; - -import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; - -import java.io.IOException; -import java.io.Serializable; - -/** - * Created by claudio on 09/03/16. 
- */ -public class ScoreResult implements Serializable { - - private ConditionEvalMap strictConditions; - - private ConditionEvalMap conditions; - - private DistanceEvalMap distances; - - public double getScore() { - - if (getStrictConditions().result() > 0) return 1.0; - // if (getStrictConditions().result() < 0) return 0.0; - if (getConditions().result() < 0) return 0.0; - - return getDistances().distance(); - } - - - public ConditionEvalMap getStrictConditions() { - return strictConditions; - } - - public void setStrictConditions(final ConditionEvalMap strictConditions) { - this.strictConditions = strictConditions; - } - - public ConditionEvalMap getConditions() { - return conditions; - } - - public void setConditions(final ConditionEvalMap conditions) { - this.conditions = conditions; - } - - public DistanceEvalMap getDistances() { - return distances; - } - - public void setDistances(final DistanceEvalMap distances) { - this.distances = distances; - } - - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("unable to serialise " + this.getClass().getName(), e); - } - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index 996471338..c76b63716 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -14,25 +14,29 @@ import java.util.List; import java.util.Map; /** - * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm. + * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm. 
*/ public class FieldDef implements Serializable { public final static String PATH_SEPARATOR = "/"; - private String algo; - private String name; private String path; - private boolean ignoreMissing; - private Type type; - private boolean overrideMatch; + private boolean ignoreMissing; - private double weight; + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public void setIgnoreMissing(boolean ignoreMissing) { + this.ignoreMissing = ignoreMissing; + } + + private boolean overrideMatch; /** * Sets maximum size for the repeatable fields in the model. -1 for unbounded size. @@ -74,20 +78,6 @@ public class FieldDef implements Serializable { return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath())); } - public DistanceAlgo distanceAlgo() { - - if (params == null) { - params = new HashMap<>(); - } - - params.put("weight", getWeight()); - return PaceConfig.resolver.getDistanceAlgo(getAlgo(), params); - } - - public boolean isIgnoreMissing() { - return ignoreMissing; - } - public Type getType() { return type; } @@ -104,23 +94,6 @@ public class FieldDef implements Serializable { this.overrideMatch = overrideMatch; } - public double getWeight() { - return weight; - } - - public void setWeight(final double weight) { - this.weight = weight; - } - - public String getAlgo() { - return algo; - } - - public void setAlgo(final String algo) { - this.algo = algo; - } - - public int getSize() { return size; } @@ -153,10 +126,6 @@ public class FieldDef implements Serializable { this.path = path; } - public void setIgnoreMissing(boolean ignoreMissing) { - this.ignoreMissing = ignoreMissing; - } - @Override public String toString() { return new Gson().toJson(this); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java new file mode 100644 index 000000000..7e4b18788 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java @@ -0,0 
+1,42 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("alwaysMatch") +public class AlwaysMatch extends AbstractComparator { + + public AlwaysMatch(final Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public AlwaysMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + return 1.0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java new file mode 100644 index 000000000..5a844de60 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -0,0 +1,38 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("exactMatch") +public class ExactMatch extends AbstractComparator { + + public ExactMatch(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public ExactMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + return a.equals(b) ? 
1.0 : 0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java new file mode 100644 index 000000000..f2a696b47 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java @@ -0,0 +1,46 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.io.Serializable; +import java.util.Map; + +//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +@ComparatorClass("jaroWinkler") +public class JaroWinkler extends AbstractComparator { + + public JaroWinkler(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public JaroWinkler(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected JaroWinkler(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(String a, String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + return normalize(ssalgo.score(ca, cb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } + +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java new file mode 100644 index 000000000..b6b593de7 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -0,0 +1,78 @@ +package 
eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; +import java.util.Set; + +@ComparatorClass("jaroWinklerNormalizedName") +public class JaroWinklerNormalizedName extends AbstractComparator { + + private Map params; + + public JaroWinklerNormalizedName(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + this.params = params; + } + + public JaroWinklerNormalizedName(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(String a, String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + ca = normalize(ca); + cb = normalize(cb); + + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); + + Set keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue()); + Set keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue()); + + Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); + Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); + + if (sameCity(cities1,cities2)) { + + if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) { + + ca = removeKeywords(ca, keywords1); + ca = removeKeywords(ca, cities1); + cb = removeKeywords(cb, keywords2); + cb = removeKeywords(cb, cities2); + + if (ca.isEmpty() && cb.isEmpty()) + return 1.0; + else + return normalize(ssalgo.score(ca,cb)); + + } + } + + return 0.0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } 
+ +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java new file mode 100644 index 000000000..99d7a8668 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java @@ -0,0 +1,46 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +@ComparatorClass("jaroWinklerTitle") +public class JaroWinklerTitle extends AbstractComparator { + + public JaroWinklerTitle(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public JaroWinklerTitle(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(String a, String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + boolean check = checkNumbers(ca, cb); + return check ? 
0.5 : normalize(ssalgo.score(ca, cb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java new file mode 100644 index 000000000..d4d5b8f1e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java @@ -0,0 +1,36 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("level2JaroWinkler") +public class Level2JaroWinkler extends AbstractComparator { + + public Level2JaroWinkler(Map params){ + super(params, new com.wcohen.ss.Level2JaroWinkler()); + } + + public Level2JaroWinkler(double w) { + super(w, new com.wcohen.ss.Level2JaroWinkler()); + } + + protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java new file mode 100644 index 000000000..41a38c1ee --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java @@ -0,0 +1,51 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import 
eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("level2JaroWinklerTitle") +public class Level2JaroWinklerTitle extends AbstractComparator { + + public Level2JaroWinklerTitle(Map params){ + super(params, new com.wcohen.ss.Level2JaroWinkler()); + } + + public Level2JaroWinklerTitle(final double w) { + super(w, new com.wcohen.ss.Level2JaroWinkler()); + } + + protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + final String ca = cleanup(a); + final String cb = cleanup(b); + + final boolean check = checkNumbers(ca, cb); + + if (check) return 0.5; + + final String cca = finalCleanup(ca); + final String ccb = finalCleanup(cb); + + return ssalgo.score(cca, ccb); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java new file mode 100644 index 000000000..1a598ecbe --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java @@ -0,0 +1,36 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("level2Levenstein") +public class Level2Levenstein extends AbstractComparator { + + public Level2Levenstein(Map params){ + super(params, new com.wcohen.ss.Level2Levenstein()); + } + + public Level2Levenstein(double w) { + super(w, new com.wcohen.ss.Level2Levenstein()); + } + + protected Level2Levenstein(double w, AbstractStringDistance 
ssalgo) { + super(w, ssalgo); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java new file mode 100644 index 000000000..ada70fdc0 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java @@ -0,0 +1,36 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("levenstein") +public class Levenstein extends AbstractComparator { + + public Levenstein(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + } + + public Levenstein(double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + protected Levenstein(double w, AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java new file mode 100644 index 000000000..45459f4dc --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -0,0 +1,59 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import 
org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.Map; + +@ComparatorClass("levensteinTitle") +public class LevensteinTitle extends AbstractComparator { + + private static final Log log = LogFactory.getLog(LevensteinTitle.class); + + public LevensteinTitle(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + } + + public LevensteinTitle(final double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + final String ca = cleanup(a); + final String cb = cleanup(b); + + final boolean check = checkNumbers(ca, cb); + + if (check) return 0.5; + + final String cca = finalCleanup(ca); + final String ccb = finalCleanup(cb); + + return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length()); + } + + private double normalize(final double score, final int la, final int lb) { + return 1 - (Math.abs(score) / Math.max(la, lb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java new file mode 100644 index 000000000..342cee753 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java @@ -0,0 +1,60 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +/** + * Compared compare between two titles, ignoring 
version numbers. Suitable for Software entities. + */ +@ComparatorClass("levensteinTitleIgnoreVersion") +public class LevensteinTitleIgnoreVersion extends AbstractComparator { + + public LevensteinTitleIgnoreVersion(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + } + + public LevensteinTitleIgnoreVersion(final double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim(); + cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim(); + + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); + + final String cca = finalCleanup(ca); + final String ccb = finalCleanup(cb); + + return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length()); + } + + private double normalize(final double score, final int la, final int lb) { + return 1 - (Math.abs(score) / Math.max(la, lb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java new file mode 100644 index 000000000..d7251f18e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java @@ -0,0 +1,41 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("mustBeDifferent") 
+public class MustBeDifferent extends AbstractComparator { + + public MustBeDifferent(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + } + + public MustBeDifferent(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + return !a.equals(b) ? 1.0 : 0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java new file mode 100644 index 000000000..7c5c3f4ae --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java @@ -0,0 +1,25 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.distance.DistanceAlgo; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.Comparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +/** + * Not all fields of a document need to partecipate in the compare measure. We model those fields as having a + * NullDistanceAlgo. 
+ */ +@ComparatorClass("null") +public class NullDistanceAlgo implements Comparator { + + public NullDistanceAlgo(Map params){ + } + + @Override + public double compare(Field a, Field b) { + return 0; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java new file mode 100644 index 000000000..e66ad0185 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java @@ -0,0 +1,63 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractSortedComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +/** + * The Class SortedJaroWinkler. + */ +@ComparatorClass("sortedJaroWinkler") +public class SortedJaroWinkler extends AbstractSortedComparator { + + public SortedJaroWinkler(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + } + + /** + * Instantiates a new sorted jaro winkler. + * + * @param weight + * the weight + */ + public SortedJaroWinkler(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + /** + * Instantiates a new sorted jaro winkler. 
+ * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java new file mode 100644 index 000000000..952414035 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java @@ -0,0 +1,63 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractSortedComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +/** + * The Class SortedJaroWinkler. + */ +@ComparatorClass("sortedLevel2JaroWinkler") +public class SortedLevel2JaroWinkler extends AbstractSortedComparator { + + /** + * Instantiates a new sorted jaro winkler. + * + * @param weight + * the weight + */ + public SortedLevel2JaroWinkler(final double weight) { + super(weight, new com.wcohen.ss.Level2JaroWinkler()); + } + + public SortedLevel2JaroWinkler(final Map params){ + super(params, new com.wcohen.ss.Level2JaroWinkler()); + } + + /** + * Instantiates a new sorted jaro winkler. 
+ * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java new file mode 100644 index 000000000..d412f3941 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java @@ -0,0 +1,99 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import org.apache.commons.lang.StringUtils; + +import java.util.Map; + +/** + * The Class SubStringLevenstein. + */ +@ComparatorClass("subStringLevenstein") +public class SubStringLevenstein extends AbstractComparator { + + /** The limit. */ + protected int limit; + + /** + * Instantiates a new sub string levenstein. + * + * @param w + * the w + */ + public SubStringLevenstein(final double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + public SubStringLevenstein(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + this.limit = params.get("limit").intValue(); + } + + /** + * Instantiates a new sub string levenstein. 
+ * + * @param w + * the w + * @param limit + * the limit + */ + public SubStringLevenstein(final double w, final int limit) { + super(w, new com.wcohen.ss.Levenstein()); + this.limit = limit; + } + + /** + * Instantiates a new sub string levenstein. + * + * @param w + * the w + * @param limit + * the limit + * @param ssalgo + * the ssalgo + */ + protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + this.limit = limit; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) + */ + @Override + public double compare(final Field a, final Field b) { + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) + return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit)); + + throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java new file mode 100644 index 000000000..d559e0647 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java @@ -0,0 +1,60 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import org.apache.commons.lang.StringUtils; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; + 
+@ComparatorClass("urlMatcher") +public class UrlMatcher extends Levenstein { + + private Map params; + + public UrlMatcher(Map params){ + super(params); + this.params = params; + } + + public UrlMatcher(double weight, Map params) { + super(weight); + this.params = params; + } + + public void setParams(Map params) { + this.params = params; + } + + @Override + public double compare(Field a, Field b) { + + final URL urlA = asUrl(getFirstValue(a)); + final URL urlB = asUrl(getFirstValue(b)); + + if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { + return 0.0; + } + + Double hostW = params.get("host").doubleValue(); + Double pathW = params.get("path").doubleValue(); + + if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { + return hostW * 0.5; + } + + return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath()); + } + + private URL asUrl(final String value) { + try { + return new URL(value); + } catch (MalformedURLException e) { + // should not happen as checked by pace typing + throw new IllegalStateException("invalid URL: " + value); + } + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java new file mode 100644 index 000000000..ef9abebe4 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java @@ -0,0 +1,110 @@ +package eu.dnetlib.pace.tree.support; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; + +import java.util.List; +import java.util.Map; + +public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator { + + /** The ssalgo. */ + protected AbstractStringDistance ssalgo; + + /** The weight. 
*/ + protected double weight = 0.0; + + private Map<String, Number> params; + /** Creates a comparator with weight 1.0; params are stored as given. */ + protected AbstractComparator(Map<String, Number> params, final AbstractStringDistance ssalgo){ + this.params = params; + this.weight = 1.0; + this.ssalgo = ssalgo; + } + + /** + * Creates a comparator with an explicit weight. + * + * @param weight + * the weight reported by getWeight() + * @param ssalgo + * the algorithm used to score two strings + */ + protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) { + this.ssalgo = ssalgo; + this.weight = weight; + } + /** Creates a comparator whose weight keeps its 0.0 default. */ + protected AbstractComparator(final AbstractStringDistance ssalgo){ + this.ssalgo = ssalgo; + } + + /** + * Converts a raw algorithm score into the value returned by distance(). + * + * @param d + * the raw score produced by ssalgo + * @return the normalized score + */ + protected abstract double normalize(double d); + + /** + * Scores two strings with ssalgo and normalizes the result. + * + * @param a + * the first value + * @param b + * the second value + * @return the normalized score, or -1 when either value is null or empty + */ + public double distance(final String a, final String b) { + + if (a == null || b == null || a.isEmpty() || b.isEmpty()) { + return -1; //return -1 if a field is missing + } + double score = ssalgo.score(a, b); + return normalize(score); + } + + /** + * Scores two lists of strings by comparing their concatenations. + * + * @param a + * the first list + * @param b + * the second list + * @return the result of distance(String, String) on the concatenated lists + */ + protected double distance(final List<String> a, final List<String> b) { + return distance(concat(a), concat(b)); + } + /** Compares two fields of the same type (String or List); returns -1 when either one is null or empty. */ + @Override + public double compare(final Field a, final Field b) { + if (a == null || b == null || a.isEmpty() || b.isEmpty()) + return -1; + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue()); + if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b)); + + throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + } + + /** + * Returns the string values of a list field.
+ * + * @param list + * the field, which must be a FieldList + * @return the string values of the field + */ + protected List<String> toList(final Field list) { + return ((FieldList) list).stringList(); + } + + public double getWeight(){ + return this.weight; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java new file mode 100644 index 000000000..18b5e67c6 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java @@ -0,0 +1,38 @@ +package eu.dnetlib.pace.tree.support; + +import com.google.common.collect.Lists; +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +public abstract class AbstractSortedComparator extends AbstractComparator { + + /** + * Instantiates a new sorted second string compare algo.
+ * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + /** Reads the weight from params, falling back to 1.0 when no "weight" entry is present. */ + protected AbstractSortedComparator(final Map<String, Number> params, final AbstractStringDistance ssalgo){ + super(params.getOrDefault("weight", 1.0).doubleValue(), ssalgo); + } + /** Returns a sorted copy of the string values of the field. */ + @Override + protected List<String> toList(final Field list) { + FieldList fl = (FieldList) list; + List<String> values = Lists.newArrayList(fl.stringList()); + Collections.sort(values); + return values; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java new file mode 100644 index 000000000..8f7316ba5 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java @@ -0,0 +1,22 @@ +package eu.dnetlib.pace.tree.support; + +import eu.dnetlib.pace.util.PaceException; + +public enum AggType { + + WEIGHTED_MEAN, + AVG, + SUM, + MAX, + MIN; + /** Resolves an aggregation by its exact constant name; an unknown or null name is reported as PaceException. */ + public static AggType getEnum(String value) { + + try { + return AggType.valueOf(value); + } + catch (IllegalArgumentException | NullPointerException e) { //valueOf throws NullPointerException for a null name + throw new PaceException("Undefined aggregation type: " + value, e); + } + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java new file mode 100644 index 000000000..ea4ad7e9b --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java @@ -0,0 +1,9 @@ +package eu.dnetlib.pace.tree.support; + +import eu.dnetlib.pace.model.Field; + +public interface Comparator { + /** Scores the similarity of two fields. */ + public double compare(Field a, Field b); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java new file mode 100644 index 000000000..8c3002eb6 --- /dev/null +++
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java @@ -0,0 +1,13 @@ +package eu.dnetlib.pace.tree.support; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface ComparatorClass { + /** Name that identifies the annotated comparator. */ + public String value(); +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java similarity index 81% rename from dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java rename to dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java index 710bf10f8..3dd4b0185 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java @@ -1,4 +1,4 @@ -package eu.dnetlib.pace.model; +package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.util.PaceException; import org.codehaus.jackson.map.ObjectMapper; @@ -14,14 +14,25 @@ public class FieldConf implements Serializable { private double weight = 1.0; //weight for the field (to be used in the aggregation) private Map<String, Number> params; //parameters + private boolean ignoreMissing; //when true, a missing value of this field is left out of the aggregation + + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public void setIgnoreMissing(boolean ignoreMissing) { + this.ignoreMissing = ignoreMissing; + } + public FieldConf() { } - public FieldConf(String field, String comparator, double weight, Map<String, Number> params) { + public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean ignoreMissing) { this.field = field; this.comparator = comparator; this.weight = weight; this.params = params; + this.ignoreMissing = ignoreMissing; } public String getField() { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java new file mode 100644 index 000000000..c16039587 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java @@ -0,0 +1,18 @@ +package eu.dnetlib.pace.tree.support; + +public enum MatchType { + + MATCH, + NO_MATCH, + UNDEFINED; + /** Parses a constant name; anything that is not a constant name, including null, maps to UNDEFINED. */ + public static MatchType parse(String value) { + if (value == null) return MatchType.UNDEFINED; //valueOf(null) throws NullPointerException, which the catch below does not cover + try { + return MatchType.valueOf(value); + } + catch (IllegalArgumentException e) { + return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable + } + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java new file mode 100644 index 000000000..4af9f8126 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -0,0 +1,157 @@ +package eu.dnetlib.pace.tree.support; + +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; +import org.codehaus.jackson.map.ObjectMapper; + +import java.io.IOException; +import java.io.Serializable; +import java.util.List; + +public class TreeNodeDef implements Serializable { + + private List<FieldConf> fields; + private AggType aggregation; + + private double threshold; + + private String positive; + private String negative; + private String undefined; + + boolean ignoreMissing; + + public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreMissing) { + this.fields = fields; + this.aggregation = aggregation; + this.threshold = threshold; + this.positive = positive; + this.negative = negative; + this.undefined = undefined; + this.ignoreMissing = ignoreMissing; + } + + public TreeNodeDef() { + } + /** Compares the configured fields of the two documents and aggregates the scores; returns -1 when a field is missing and ignoreMissing is false. */ + public double evaluate(MapDocument doc1, MapDocument doc2) { + +
DescriptiveStatistics stats = new DescriptiveStatistics(); + double sumWeights = 0.0; //for the weighted mean + + int missCount = 0; //counter for the number of misses + + for (FieldConf fieldConf : fields) { + + double weight = fieldConf.getWeight(); + + double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())); + + if (result >= 0.0) { //if the field is not missing + stats.addValue(weight * result); + sumWeights += weight; //sum weights, to be used in case of weighted mean + } + else { //if the field is missing + missCount += 1; + if (!fieldConf.isIgnoreMissing()){ //if the miss has not to be ignored + stats.addValue(weight * 0); + sumWeights += weight; + } + } + } + + //global ignoremissing (if one of the field is missing, return undefined) + if (!ignoreMissing && missCount>0) { + return -1; + } + if (stats.getN() == 0) return -1; //nothing was aggregated: report undefined instead of the NaN that empty statistics produce + switch (aggregation){ + + case AVG: + return stats.getMean(); + case SUM: + return stats.getSum(); + case MAX: + return stats.getMax(); + case MIN: + return stats.getMin(); + case WEIGHTED_MEAN: + return stats.getSum()/sumWeights; + default: + return 0.0; + } + + } + + private Comparator comparator(final FieldConf field){ + + return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams()); + } + + public List<FieldConf> getFields() { + return fields; + } + + public void setFields(List<FieldConf> fields) { + this.fields = fields; + } + + public AggType getAggregation() { + return aggregation; + } + + public void setAggregation(AggType aggregation) { + this.aggregation = aggregation; + } + + public double getThreshold() { + return threshold; + } + + public void setThreshold(double threshold) { + this.threshold = threshold; + } + + public String getPositive() { + return positive; + } + + public void setPositive(String positive) { + this.positive = positive; + } + + public String getNegative() { + return negative; + } + + public void setNegative(String negative) { + this.negative = negative; + }
+ + public String getUndefined() { + return undefined; + } + + public void setUndefined(String undefined) { + this.undefined = undefined; + } + + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public void setIgnoreMissing(boolean ignoreMissing) { + this.ignoreMissing = ignoreMissing; + } + + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 01da9c227..b1348e144 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -4,8 +4,8 @@ import com.google.common.collect.Lists; import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.WfConfig; -import eu.dnetlib.pace.distance.PaceDocumentDistance; -import eu.dnetlib.pace.distance.eval.ScoreResult; +//import eu.dnetlib.pace.distance.PaceDocumentDistance; +import eu.dnetlib.pace.distance.PairwiseComparison; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocumentComparator; @@ -116,7 +116,7 @@ public class BlockProcessor { private void process(final Queue<MapDocument> queue, final Reporter context) { - final PaceDocumentDistance algo = new PaceDocumentDistance(); +// final PaceDocumentDistance algo = new PaceDocumentDistance(); while (!queue.isEmpty()) { @@ -150,21 +150,23 @@ public class BlockProcessor { if (!idCurr.equals(idPivot) && (fieldCurr != null)) { - final ScoreResult sr = similarity(algo, pivot, curr); -// log.info(sr.toString()+"SCORE "+ sr.getScore()); - emitOutput(sr, idPivot, idCurr, context); - i++; + final PairwiseComparison pairwiseComparison = new
PairwiseComparison(dedupConf); + + emitOutput(pairwiseComparison.compare(pivot, curr), idPivot, idCurr, context); + +// final ScoreResult sr = similarity(algo, pivot, curr); +//// log.info(sr.toString()+"SCORE "+ sr.getScore()); +// emitOutput(sr, idPivot, idCurr, context); + /* still count the comparisons made for this pivot, as the replaced code did */ i++; } } } } } - private void emitOutput(final ScoreResult sr, final String idPivot, final String idCurr, final Reporter context) { - final double d = sr.getScore(); - - if (d >= dedupConf.getWf().getThreshold()) { + private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { + if (result) { writeSimilarity(context, idPivot, idCurr); context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); } else { @@ -172,15 +174,6 @@ public class BlockProcessor { } } - private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) { - try { - return algo.between(a, b, dedupConf); - } catch(Throwable e) { - log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e); - throw new IllegalArgumentException(e); - } - } - private boolean mustSkip(final String idPivot) { return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index fbbc9d77b..45e011fdd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -1390,7 +1390,7 @@ public class DiffPatchMatch { } /** - * Compute the Levenshtein distance; the number of inserted, deleted or + * Compute the Levenshtein distance, i.e. the number of inserted, deleted or * substituted characters. * @param diffs List of Diff objects. * @return Number of changes.
@@ -1655,7 +1655,7 @@ public class DiffPatchMatch { score_threshold = score; best_loc = j - 1; if (best_loc > loc) { - // When passing loc, don't exceed our current distance from loc. + // When passing loc, do not exceed our current distance from loc. start = Math.max(1, 2 * loc - best_loc); } else { // Already passed loc, downhill from here on in. diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index d9868b690..c3b16c83e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -7,6 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass; import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.tree.support.Comparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; import org.reflections.Reflections; import java.io.Serializable; @@ -19,11 +21,13 @@ public class PaceResolver implements Serializable { public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering"); public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition"); public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.distance.algo"); + public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree"); private final Map<String, Class<ClusteringFunction>> clusteringFunctions; private final Map<String, Class<ConditionAlgo>> conditionAlgos; private final Map<String, Class<DistanceAlgo>> distanceAlgos; + private final Map<String, Class<Comparator>> comparators; public PaceResolver() { @@ -38,6 +42,10 @@ public class PaceResolver implements Serializable { this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream() .filter(DistanceAlgo.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl)); + + this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream() + .filter(Comparator.class::isAssignableFrom) + .collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl)); } public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException { @@ -64,4 +72,12 @@ public class PaceResolver implements Serializable { } } + public Comparator getComparator(String name, Map<String, Number> params) throws PaceException { + try { + return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { + throw new PaceException(name + " not found ", e); + } + } + } diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st index 97aef2d23..100b023f8 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st @@ -16,9 +16,9 @@ "pace" : { "clustering" : [ ], - "strictConditions" : [ + "sufficientConditions" : [ ], - "conditions" : [ + "necessaryConditions" : [ ], "model" : [ ], diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf index fd4fbbe79..53768c07c 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf @@ -16,10 +16,10 @@ { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } ], - "strictConditions" : [ +
"sufficientConditions" : [ { "name" : "exactMatch", "fields" : [ "gridid" ] } ], - "conditions" : [ + "necessaryConditions" : [ { "name" : "exactMatch", "fields" : [ "country" ] }, { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } ],