diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java index 2b4aa29a1..5c7b4d130 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java @@ -9,7 +9,7 @@ import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; /** - * Abstract conditions needs a list of field names. + * Abstract necessaryConditions needs a list of field names. * * @author claudio * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java index 787ad9af1..1ea9caa16 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java @@ -6,7 +6,7 @@ import eu.dnetlib.pace.model.Document; import eu.dnetlib.pace.model.FieldDef; /** - * Allows to express general conditions to be satisfied or not between two Documents. + * Allows to express general necessaryConditions to be satisfied or not between two Documents. * * @author claudio */ diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 7498c23cf..1a12e699f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -6,6 +6,7 @@ import java.util.Map; import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.tree.support.TreeNodeDef; /** * Interface for PACE configuration bean. @@ -21,6 +22,9 @@ public interface Config { */ public List model(); + + public Map decisionTree(); + /** * Field configuration definitions. 
* @@ -31,16 +35,16 @@ public interface Config { /** * Strict Pre-Condition definitions. * - * @return the list of conditions + * @return the list of necessaryConditions */ - public List strictConditions(); + public List sufficientConditions(); /** * Pre-Condition definitions. * - * @return the list of conditions + * @return the list of necessaryConditions */ - public List conditions(); + public List necessaryConditions(); /** * Clusterings. diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index 1cfcb089c..805f5c38e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -8,6 +8,7 @@ import java.util.Map; import java.util.Map.Entry; import java.util.function.BiFunction; +import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceException; import org.antlr.stringtemplate.StringTemplate; import org.apache.commons.io.IOUtils; @@ -114,6 +115,11 @@ public class DedupConfig implements Config, Serializable { } } + @Override + public Map decisionTree(){ + return getPace().getDecisionTree(); + } + @Override public List model() { return getPace().getModel(); @@ -125,12 +131,12 @@ public class DedupConfig implements Config, Serializable { } @Override - public List strictConditions() { + public List sufficientConditions() { return getPace().getStrictConditionAlgos(); } @Override - public List conditions() { + public List necessaryConditions() { return getPace().getConditionAlgos(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 4fde1dee9..d90cfe381 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -6,6 +6,7 @@ import 
eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceResolver; import org.apache.commons.collections.CollectionUtils; import org.codehaus.jackson.annotate.JsonIgnore; @@ -18,9 +19,12 @@ import java.util.stream.Collectors; public class PaceConfig implements Serializable { private List model; - private List strictConditions; - private List conditions; + + private List sufficientConditions; + private List necessaryConditions; private List clustering; + private Map decisionTree; + private Map> blacklists; @JsonIgnore @@ -46,30 +50,30 @@ public class PaceConfig implements Serializable { this.model = model; } - public List getStrictConditions() { - return strictConditions; + public List getSufficientConditions() { + return sufficientConditions; } - public void setStrictConditions(final List strictConditions) { - this.strictConditions = strictConditions; + public void setSufficientConditions(final List sufficientConditions) { + this.sufficientConditions = sufficientConditions; } - public List getConditions() { - return conditions; + public List getNecessaryConditions() { + return necessaryConditions; } @JsonIgnore public List getConditionAlgos() { - return asConditionAlgos(getConditions()); + return asConditionAlgos(getNecessaryConditions()); } @JsonIgnore public List getStrictConditionAlgos() { - return asConditionAlgos(getStrictConditions()); + return asConditionAlgos(getSufficientConditions()); } - public void setConditions(final List conditions) { - this.conditions = conditions; + public void setNecessaryConditions(final List necessaryConditions) { + this.necessaryConditions = necessaryConditions; } public List getClustering() { @@ -80,6 +84,14 @@ public class PaceConfig implements Serializable { this.clustering = clustering; } + public Map getDecisionTree() { + return 
decisionTree; + } + + public void setDecisionTree(Map decisionTree) { + this.decisionTree = decisionTree; + } + public Map> getBlacklists() { return blacklists; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java index f9d189ff6..3304f3638 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java @@ -1,15 +1,15 @@ -package eu.dnetlib.pace.distance; - -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.distance.eval.ScoreResult; -import eu.dnetlib.pace.model.Document; - -public abstract class AbstractDistance implements Distance { - - protected abstract Document toDocument(A a); - - @Override - public ScoreResult between(final A a, final A b, final Config config) { - return new DistanceScorer(config).distance(toDocument(a), toDocument(b)); - } -} +//package eu.dnetlib.pace.distance; +// +//import eu.dnetlib.pace.config.Config; +//import eu.dnetlib.pace.distance.eval.ScoreResult; +//import eu.dnetlib.pace.model.Document; +// +//public abstract class AbstractDistance implements Distance { +// +// protected abstract Document toDocument(A a); +// +// @Override +// public boolean between(final A a, final A b, final Config config) { +// return new PairwiseComparison(config).compare(toDocument(a), toDocument(b)); +// } +//} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java index 93a6e757a..b812bd1d9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java @@ -1,9 +1,8 @@ package eu.dnetlib.pace.distance; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.distance.eval.ScoreResult; public interface Distance { - public ScoreResult 
between(A a, A b, Config config); + public boolean between(A a, A b, Config config); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java index 5e4f69f51..80b2191cc 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java @@ -5,7 +5,7 @@ import eu.dnetlib.pace.model.Field; import java.util.Map; /** - * Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two + * Each field is configured with a compare algo which knows how to compute the compare (0-1) between the fields of two * objects. */ public interface DistanceAlgo { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java deleted file mode 100644 index bb3c37ed6..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java +++ /dev/null @@ -1,126 +0,0 @@ -package eu.dnetlib.pace.distance; - -import eu.dnetlib.pace.condition.ConditionAlgo; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.distance.eval.ConditionEvalMap; -import eu.dnetlib.pace.distance.eval.DistanceEval; -import eu.dnetlib.pace.distance.eval.DistanceEvalMap; -import eu.dnetlib.pace.distance.eval.ScoreResult; -import eu.dnetlib.pace.model.*; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import java.util.Collection; -import java.util.List; -import java.util.stream.Collectors; - -/** - * The distance between two documents is given by the weighted mean of the field distances - */ -public class DistanceScorer { - - private static final Log log = LogFactory.getLog(DistanceScorer.class); - - private Config 
config; - - public DistanceScorer(final Config config) { - this.config = config; - } - - public ScoreResult distance(final Document a, final Document b) { - final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison - - sr.setStrictConditions(verify(a, b, config.strictConditions())); - sr.setConditions(verify(a, b, config.conditions())); - - final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model())); - - for (final FieldDef fd : config.model()) { - - dMap.updateDistance(fieldDistance(a, b, fd)); - } - sr.setDistances(dMap); - return sr; - } - - private ConditionEvalMap verify(final Document a, final Document b, final List conditions) { - final ConditionEvalMap res = new ConditionEvalMap(); - - for (final ConditionAlgo cd : conditions) { - final ConditionEvalMap map = cd.verify(a, b); - res.mergeFrom(map); - - // commented out shortcuts - /* - if (map.anyNegative()) { - return res; - } - */ - - //if (strict && (res < 0)) return -1; - //cond += verify; - } - return res; - } - - private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) { - - final double w = fd.getWeight(); - final Field va = getValue(a, fd); - final Field vb = getValue(b, fd); - - final DistanceEval de = new DistanceEval(fd, va, vb); - if ((w == 0)) return de; // optimization for 0 weight - else { - if (va.isEmpty() || vb.isEmpty()) { - if (fd.isIgnoreMissing()) { - de.setDistance(-1); - } else { - de.setDistance(w); - } - } else { - if (va.getType().equals(vb.getType())) { - de.setDistance(w * fd.distanceAlgo().distance(va, vb)); - } else { - throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType())); - } - } - return de; - } - } - - private Field getValue(final Document d, final FieldDef fd) { - final Field v = d.values(fd.getName()); - if (fd.getLength() > 0) { - - if (v instanceof FieldValueImpl) { - ((FieldValueImpl) 
v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength())); - } else if (v instanceof FieldListImpl) { - List strings = ((FieldListImpl) v).stringList(); - strings = strings.stream() - .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) - .map(s -> StringUtils.substring(s, 0, fd.getLength())) - .collect(Collectors.toList()); - ((FieldListImpl) v).clear(); - ((FieldListImpl) v).addAll(strings.stream() - .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) - .map(s -> StringUtils.substring(s, 0, fd.getLength())) - .map(s -> new FieldValueImpl(v.getType(), v.getName(), s)) - .collect(Collectors.toList())); - } - } - - return v; - } - - private double sumWeights(final Collection fields) { - double sum = 0.0; - for (final FieldDef fd : fields) { - sum += fd.getWeight(); - } - return sum; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java index 7651479ee..9c75bfcf8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java @@ -1,12 +1,12 @@ -package eu.dnetlib.pace.distance; - -import eu.dnetlib.pace.model.Document; - -public class PaceDocumentDistance extends AbstractDistance { - - @Override - protected Document toDocument(Document a) { - return a; - } - -} +//package eu.dnetlib.pace.distance; +// +//import eu.dnetlib.pace.model.Document; +// +//public class PaceDocumentDistance extends AbstractDistance { +// +// @Override +// protected Document toDocument(Document a) { +// return a; +// } +// +//} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java new file mode 100644 index 000000000..125919d0f --- /dev/null +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java @@ -0,0 +1,125 @@ +package eu.dnetlib.pace.distance; + +import eu.dnetlib.pace.condition.ConditionAlgo; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.distance.eval.ConditionEvalMap; +import eu.dnetlib.pace.model.*; +import eu.dnetlib.pace.tree.support.MatchType; +import eu.dnetlib.pace.tree.support.TreeNodeDef; +import eu.dnetlib.pace.util.PaceException; +import eu.dnetlib.pace.util.Reporter; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.List; +import java.util.Map; + +/** + * Decides whether two documents match: sufficient conditions are checked first, then necessary conditions, and finally the decision tree is evaluated. + */ +public class PairwiseComparison { + + private static final Log log = LogFactory.getLog(PairwiseComparison.class); + + private Config config; + + public PairwiseComparison(final Config config) { + this.config = config; + } + + public boolean compare(final MapDocument a, final MapDocument b) { + + //verify sufficientConditions + if (verify(a, b, config.sufficientConditions()).result() > 0) + return true; + + //verify necessaryConditions + if (verify(a, b, config.necessaryConditions()).result() < 0) + return false; + + //evaluate the decision tree + return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH; + } + + private ConditionEvalMap verify(final Document a, final Document b, final List conditions) { + final ConditionEvalMap res = new ConditionEvalMap(); + + for (final ConditionAlgo cd : conditions) { + final ConditionEvalMap map = cd.verify(a, b); + res.mergeFrom(map); + + // commented out shortcuts + /* + if (map.anyNegative()) { + return res; + } + */ + + //if (strict && (res < 0)) return -1; + //cond += verify; + } + return res; + } + + public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map decisionTree){ + + String current = "start"; + double similarity; + + while 
(MatchType.parse(current)==MatchType.UNDEFINED) { + + TreeNodeDef currentNode = decisionTree.get(current); + //throw an exception if the node doesn't exist + if (currentNode == null) + throw new PaceException("The Tree Node doesn't exist: " + current); + + similarity = currentNode.evaluate(doc1, doc2); + + if (similarity == -1) { + current = currentNode.getUndefined(); + } + else if (similarity>=currentNode.getThreshold()){ + current = currentNode.getPositive(); + } + else { + current = currentNode.getNegative(); + } + + } + + return MatchType.parse(current); + } + +// private Field getValue(final Document d, final FieldDef fd) { +// final Field v = d.values(fd.getName()); +// if (fd.getLength() > 0) { +// +// if (v instanceof FieldValueImpl) { +// ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength())); +// } else if (v instanceof FieldListImpl) { +// List strings = ((FieldListImpl) v).stringList(); +// strings = strings.stream() +// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) +// .map(s -> StringUtils.substring(s, 0, fd.getLength())) +// .collect(Collectors.toList()); +// ((FieldListImpl) v).clear(); +// ((FieldListImpl) v).addAll(strings.stream() +// .limit(fd.getSize() > 0 ? 
fd.getSize() : strings.size()) +// .map(s -> StringUtils.substring(s, 0, fd.getLength())) +// .map(s -> new FieldValueImpl(v.getType(), v.getName(), s)) +// .collect(Collectors.toList())); +// } +// } +// +// return v; +// } +// +// private double sumWeights(final Collection fields) { +// double sum = 0.0; +// for (final FieldDef fd : fields) { +// sum += fd.getWeight(); +// } +// return sum; +// } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java index 9cc35298f..d3ccccec5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java @@ -12,7 +12,7 @@ import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; /** - * For the rest of the fields delegate the distance measure to the second string library. + * For the rest of the fields delegate the compare measure to the second string library. */ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo { @@ -35,7 +35,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp } /** - * Instantiates a new second string distance algo. + * Instantiates a new second string compare algo. 
* * @param weight * the weight @@ -90,7 +90,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) + * @see eu.dnetlib.pace.compare.DistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) */ @Override public double distance(final Field a, final Field b) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java index 503dc33b2..ae0ef9d00 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.DistanceScorer; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java index ff8b34bf3..6303f8e2a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java @@ -7,7 +7,7 @@ import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import java.util.Map; /** - * Compared distance between two titles, ignoring version numbers. Suitable for Software entities. + * Compared compare between two titles, ignoring version numbers. Suitable for Software entities. 
*/ @DistanceClass("LevensteinTitleIgnoreVersion") public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java index 8afc45fd6..c1b1d7223 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java @@ -7,7 +7,7 @@ import eu.dnetlib.pace.model.Field; import java.util.Map; /** - * Not all fields of a document need to partecipate in the distance measure. We model those fields as having a + * Not all fields of a document need to partecipate in the compare measure. We model those fields as having a * NullDistanceAlgo. */ @DistanceClass("Null") diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java index e3175a13e..d3aa58f99 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java @@ -40,7 +40,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override public double getWeight() { @@ -50,7 +50,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override protected double normalize(final double d) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java index 1a12e1688..2523153ed 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java @@ -40,7 +40,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override public double getWeight() { @@ -50,7 +50,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override protected double normalize(final double d) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java index 8a9c51402..f72750521 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java @@ -12,12 +12,12 @@ import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; /** - * For the rest of the fields delegate the distance measure to the second string library. + * For the rest of the fields delegate the compare measure to the second string library. */ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo { /** - * Instantiates a new sorted second string distance algo. + * Instantiates a new sorted second string compare algo. 
* * @param weight * the weight @@ -35,7 +35,7 @@ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanc /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field) */ @Override protected List toList(final Field list) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java index 8f0c024c7..b788fade4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java @@ -66,7 +66,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) */ @Override public double distance(final Field a, final Field b) { @@ -79,7 +79,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override public double getWeight() { @@ -89,7 +89,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { /* * (non-Javadoc) * - * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override protected double normalize(final double d) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java deleted file mode 100644 
index 764e60354..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java +++ /dev/null @@ -1,32 +0,0 @@ -package eu.dnetlib.pace.distance.eval; - -import java.util.HashMap; - -/** - * Created by claudio on 10/03/16. - */ -public class DistanceEvalMap extends HashMap { - - private double sumWeights; - - private double sumDistances = 0.0; - - public DistanceEvalMap(final double sumWeights) { - this.sumWeights = sumWeights; - } - - public void updateDistance(final DistanceEval d) { - - put(d.getFieldDef().getName(), d); - if (d.getDistance() >= 0) { - sumDistances += d.getDistance(); - } else { - sumWeights -= d.getFieldDef().getWeight(); - } - } - - public double distance() { - return sumWeights == 0 ? 0 : sumDistances / sumWeights; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java deleted file mode 100644 index 62b7d85b4..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java +++ /dev/null @@ -1,62 +0,0 @@ -package eu.dnetlib.pace.distance.eval; - -import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; - -import java.io.IOException; -import java.io.Serializable; - -/** - * Created by claudio on 09/03/16. 
- */ -public class ScoreResult implements Serializable { - - private ConditionEvalMap strictConditions; - - private ConditionEvalMap conditions; - - private DistanceEvalMap distances; - - public double getScore() { - - if (getStrictConditions().result() > 0) return 1.0; - // if (getStrictConditions().result() < 0) return 0.0; - if (getConditions().result() < 0) return 0.0; - - return getDistances().distance(); - } - - - public ConditionEvalMap getStrictConditions() { - return strictConditions; - } - - public void setStrictConditions(final ConditionEvalMap strictConditions) { - this.strictConditions = strictConditions; - } - - public ConditionEvalMap getConditions() { - return conditions; - } - - public void setConditions(final ConditionEvalMap conditions) { - this.conditions = conditions; - } - - public DistanceEvalMap getDistances() { - return distances; - } - - public void setDistances(final DistanceEvalMap distances) { - this.distances = distances; - } - - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("unable to serialise " + this.getClass().getName(), e); - } - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index 996471338..c76b63716 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -14,25 +14,29 @@ import java.util.List; import java.util.Map; /** - * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm. + * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm. 
*/ public class FieldDef implements Serializable { public final static String PATH_SEPARATOR = "/"; - private String algo; - private String name; private String path; - private boolean ignoreMissing; - private Type type; - private boolean overrideMatch; + private boolean ignoreMissing; - private double weight; + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public void setIgnoreMissing(boolean ignoreMissing) { + this.ignoreMissing = ignoreMissing; + } + + private boolean overrideMatch; /** * Sets maximum size for the repeatable fields in the model. -1 for unbounded size. @@ -74,20 +78,6 @@ public class FieldDef implements Serializable { return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath())); } - public DistanceAlgo distanceAlgo() { - - if (params == null) { - params = new HashMap<>(); - } - - params.put("weight", getWeight()); - return PaceConfig.resolver.getDistanceAlgo(getAlgo(), params); - } - - public boolean isIgnoreMissing() { - return ignoreMissing; - } - public Type getType() { return type; } @@ -104,23 +94,6 @@ public class FieldDef implements Serializable { this.overrideMatch = overrideMatch; } - public double getWeight() { - return weight; - } - - public void setWeight(final double weight) { - this.weight = weight; - } - - public String getAlgo() { - return algo; - } - - public void setAlgo(final String algo) { - this.algo = algo; - } - - public int getSize() { return size; } @@ -153,10 +126,6 @@ public class FieldDef implements Serializable { this.path = path; } - public void setIgnoreMissing(boolean ignoreMissing) { - this.ignoreMissing = ignoreMissing; - } - @Override public String toString() { return new Gson().toJson(this); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java new file mode 100644 index 000000000..7e4b18788 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java @@ -0,0 
+1,42 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("alwaysMatch") +public class AlwaysMatch extends AbstractComparator { + + public AlwaysMatch(final Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public AlwaysMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + return 1.0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java new file mode 100644 index 000000000..5a844de60 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -0,0 +1,38 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("exactMatch") +public class ExactMatch extends AbstractComparator { + + public ExactMatch(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public ExactMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + return a.equals(b) ? 
1.0 : 0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java new file mode 100644 index 000000000..f2a696b47 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java @@ -0,0 +1,46 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.io.Serializable; +import java.util.Map; + +//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +@ComparatorClass("jaroWinkler") +public class JaroWinkler extends AbstractComparator { + + public JaroWinkler(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public JaroWinkler(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected JaroWinkler(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(String a, String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + return normalize(ssalgo.score(ca, cb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } + +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java new file mode 100644 index 000000000..b6b593de7 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -0,0 +1,78 @@ +package 
eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; +import java.util.Set; + +@ComparatorClass("jaroWinklerNormalizedName") +public class JaroWinklerNormalizedName extends AbstractComparator { + + private Map params; + + public JaroWinklerNormalizedName(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + this.params = params; + } + + public JaroWinklerNormalizedName(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(String a, String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + ca = normalize(ca); + cb = normalize(cb); + + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); + + Set keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue()); + Set keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue()); + + Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); + Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); + + if (sameCity(cities1,cities2)) { + + if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) { + + ca = removeKeywords(ca, keywords1); + ca = removeKeywords(ca, cities1); + cb = removeKeywords(cb, keywords2); + cb = removeKeywords(cb, cities2); + + if (ca.isEmpty() && cb.isEmpty()) + return 1.0; + else + return normalize(ssalgo.score(ca,cb)); + + } + } + + return 0.0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } 
+ +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java new file mode 100644 index 000000000..99d7a8668 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java @@ -0,0 +1,46 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +@ComparatorClass("jaroWinklerTitle") +public class JaroWinklerTitle extends AbstractComparator { + + public JaroWinklerTitle(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public JaroWinklerTitle(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(String a, String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + boolean check = checkNumbers(ca, cb); + return check ? 
0.5 : normalize(ssalgo.score(ca, cb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java new file mode 100644 index 000000000..d4d5b8f1e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java @@ -0,0 +1,36 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("level2JaroWinkler") +public class Level2JaroWinkler extends AbstractComparator { + + public Level2JaroWinkler(Map params){ + super(params, new com.wcohen.ss.Level2JaroWinkler()); + } + + public Level2JaroWinkler(double w) { + super(w, new com.wcohen.ss.Level2JaroWinkler()); + } + + protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java new file mode 100644 index 000000000..41a38c1ee --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java @@ -0,0 +1,51 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import 
eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("level2JaroWinklerTitle") +public class Level2JaroWinklerTitle extends AbstractComparator { + + public Level2JaroWinklerTitle(Map params){ + super(params, new com.wcohen.ss.Level2JaroWinkler()); + } + + public Level2JaroWinklerTitle(final double w) { + super(w, new com.wcohen.ss.Level2JaroWinkler()); + } + + protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + final String ca = cleanup(a); + final String cb = cleanup(b); + + final boolean check = checkNumbers(ca, cb); + + if (check) return 0.5; + + final String cca = finalCleanup(ca); + final String ccb = finalCleanup(cb); + + return ssalgo.score(cca, ccb); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java new file mode 100644 index 000000000..1a598ecbe --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java @@ -0,0 +1,36 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("level2Levenstein") +public class Level2Levenstein extends AbstractComparator { + + public Level2Levenstein(Map params){ + super(params, new com.wcohen.ss.Level2Levenstein()); + } + + public Level2Levenstein(double w) { + super(w, new com.wcohen.ss.Level2Levenstein()); + } + + protected Level2Levenstein(double w, AbstractStringDistance 
ssalgo) { + super(w, ssalgo); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java new file mode 100644 index 000000000..ada70fdc0 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java @@ -0,0 +1,36 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("levenstein") +public class Levenstein extends AbstractComparator { + + public Levenstein(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + } + + public Levenstein(double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + protected Levenstein(double w, AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java new file mode 100644 index 000000000..45459f4dc --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -0,0 +1,59 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import 
org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.Map; + +@ComparatorClass("levensteinTitle") +public class LevensteinTitle extends AbstractComparator { + + private static final Log log = LogFactory.getLog(LevensteinTitle.class); + + public LevensteinTitle(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + } + + public LevensteinTitle(final double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + final String ca = cleanup(a); + final String cb = cleanup(b); + + final boolean check = checkNumbers(ca, cb); + + if (check) return 0.5; + + final String cca = finalCleanup(ca); + final String ccb = finalCleanup(cb); + + return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length()); + } + + private double normalize(final double score, final int la, final int lb) { + return 1 - (Math.abs(score) / Math.max(la, lb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java new file mode 100644 index 000000000..342cee753 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java @@ -0,0 +1,60 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +/** + * Compares two titles, ignoring
version numbers. Suitable for Software entities. + */ +@ComparatorClass("levensteinTitleIgnoreVersion") +public class LevensteinTitleIgnoreVersion extends AbstractComparator { + + public LevensteinTitleIgnoreVersion(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + } + + public LevensteinTitleIgnoreVersion(final double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim(); + cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim(); + + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); + + final String cca = finalCleanup(ca); + final String ccb = finalCleanup(cb); + + return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length()); + } + + private double normalize(final double score, final int la, final int lb) { + return 1 - (Math.abs(score) / Math.max(la, lb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java new file mode 100644 index 000000000..d7251f18e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java @@ -0,0 +1,41 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("mustBeDifferent") 
+public class MustBeDifferent extends AbstractComparator { + + public MustBeDifferent(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public MustBeDifferent(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + return !a.equals(b) ? 1.0 : 0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java new file mode 100644 index 000000000..7c5c3f4ae --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java @@ -0,0 +1,25 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.distance.DistanceAlgo; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.Comparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +/** + * Not all fields of a document need to participate in the compare measure. We model those fields as having a + * NullDistanceAlgo.
+ */ +@ComparatorClass("null") +public class NullDistanceAlgo implements Comparator { + + public NullDistanceAlgo(Map params){ + } + + @Override + public double compare(Field a, Field b) { + return 0; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java new file mode 100644 index 000000000..e66ad0185 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java @@ -0,0 +1,63 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractSortedComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +/** + * The Class SortedJaroWinkler. + */ +@ComparatorClass("sortedJaroWinkler") +public class SortedJaroWinkler extends AbstractSortedComparator { + + public SortedJaroWinkler(Map params){ + super(params, new com.wcohen.ss.JaroWinkler()); + } + + /** + * Instantiates a new sorted jaro winkler. + * + * @param weight + * the weight + */ + public SortedJaroWinkler(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + /** + * Instantiates a new sorted jaro winkler.
+ * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java new file mode 100644 index 000000000..952414035 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java @@ -0,0 +1,63 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo; +import eu.dnetlib.pace.tree.support.AbstractSortedComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +/** + * The Class SortedJaroWinkler. + */ +@ComparatorClass("sortedLevel2JaroWinkler") +public class SortedLevel2JaroWinkler extends AbstractSortedComparator { + + /** + * Instantiates a new sorted jaro winkler. + * + * @param weight + * the weight + */ + public SortedLevel2JaroWinkler(final double weight) { + super(weight, new com.wcohen.ss.Level2JaroWinkler()); + } + + public SortedLevel2JaroWinkler(final Map params){ + super(params, new com.wcohen.ss.Level2JaroWinkler()); + } + + /** + * Instantiates a new sorted jaro winkler. 
+ * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java new file mode 100644 index 000000000..d412f3941 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java @@ -0,0 +1,99 @@ +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import org.apache.commons.lang.StringUtils; + +import java.util.Map; + +/** + * The Class SubStringLevenstein. + */ +@ComparatorClass("subStringLevenstein") +public class SubStringLevenstein extends AbstractComparator { + + /** The limit. */ + protected int limit; + + /** + * Instantiates a new sub string levenstein. + * + * @param w + * the w + */ + public SubStringLevenstein(final double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + public SubStringLevenstein(Map params){ + super(params, new com.wcohen.ss.Levenstein()); + this.limit = params.get("limit").intValue(); + } + + /** + * Instantiates a new sub string levenstein. 
+ * + * @param w + * the w + * @param limit + * the limit + */ + public SubStringLevenstein(final double w, final int limit) { + super(w, new com.wcohen.ss.Levenstein()); + this.limit = limit; + } + + /** + * Instantiates a new sub string levenstein. + * + * @param w + * the w + * @param limit + * the limit + * @param ssalgo + * the ssalgo + */ + protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + this.limit = limit; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) + */ + @Override + public double compare(final Field a, final Field b) { + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) + return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit)); + + throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java new file mode 100644 index 000000000..d559e0647 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java @@ -0,0 +1,60 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import org.apache.commons.lang.StringUtils; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; + 
+@ComparatorClass("urlMatcher") +public class UrlMatcher extends Levenstein { + + private Map params; + + public UrlMatcher(Map params){ + super(params); + this.params = params; + } + + public UrlMatcher(double weight, Map params) { + super(weight); + this.params = params; + } + + public void setParams(Map params) { + this.params = params; + } + + @Override + public double compare(Field a, Field b) { + + final URL urlA = asUrl(getFirstValue(a)); + final URL urlB = asUrl(getFirstValue(b)); + + if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { + return 0.0; + } + + Double hostW = params.get("host").doubleValue(); + Double pathW = params.get("path").doubleValue(); + + if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { + return hostW * 0.5; + } + + return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath()); + } + + private URL asUrl(final String value) { + try { + return new URL(value); + } catch (MalformedURLException e) { + // should not happen as checked by pace typing; keep the cause so a bad value can still be diagnosed + throw new IllegalStateException("invalid URL: " + value, e); + } + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java new file mode 100644 index 000000000..ef9abebe4 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java @@ -0,0 +1,110 @@ +package eu.dnetlib.pace.tree.support; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; + +import java.util.List; +import java.util.Map; + +public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator { + + /** The ssalgo. */ + protected AbstractStringDistance ssalgo; + + /** The weight.
*/ + protected double weight = 0.0; + + private Map params; + + protected AbstractComparator(Map params, final AbstractStringDistance ssalgo){ + this.params = params; + this.weight = 1.0; + this.ssalgo = ssalgo; + } + + /** + * Instantiates a new second string compare algo. + * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) { + this.ssalgo = ssalgo; + this.weight = weight; + } + + protected AbstractComparator(final AbstractStringDistance ssalgo){ + this.ssalgo = ssalgo; + } + + /** + * Normalize. + * + * @param d + * the d + * @return the double + */ + protected abstract double normalize(double d); + + /** + * Distance. + * + * @param a + * the a + * @param b + * the b + * @return the double + */ + public double distance(final String a, final String b) { + + if (a.isEmpty() || b.isEmpty()) { + return -1; //return -1 if a field is missing + } + double score = ssalgo.score(a, b); + return normalize(score); + } + + /** + * Distance. + * + * @param a + * the a + * @param b + * the b + * @return the double + */ + protected double distance(final List a, final List b) { + return distance(concat(a), concat(b)); + } + + @Override + public double compare(final Field a, final Field b) { + if (a.isEmpty() || b.isEmpty()) + return -1; + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue()); + if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b)); + + throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + } + + /** + * To list. 
+ * + * @param list + * the list + * @return the list + */ + protected List toList(final Field list) { + return ((FieldList) list).stringList(); + } + + public double getWeight(){ + return this.weight; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java new file mode 100644 index 000000000..18b5e67c6 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java @@ -0,0 +1,38 @@ +package eu.dnetlib.pace.tree.support; + +import com.google.common.collect.Lists; +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +public abstract class AbstractSortedComparator extends AbstractComparator { + + /** + * Instantiates a new sorted second string compare algo. 
+ * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + protected AbstractSortedComparator(final Map params, final AbstractStringDistance ssalgo){ + super(params.getOrDefault("weight", 1.0).doubleValue(), ssalgo); + } + + @Override + protected List toList(final Field list) { + FieldList fl = (FieldList) list; + List values = Lists.newArrayList(fl.stringList()); + Collections.sort(values); + return values; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java new file mode 100644 index 000000000..8f7316ba5 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java @@ -0,0 +1,22 @@ +package eu.dnetlib.pace.tree.support; + +import eu.dnetlib.pace.util.PaceException; + +public enum AggType { + + WEIGHTED_MEAN, + AVG, + SUM, + MAX, + MIN; + + public static AggType getEnum(String value) { + + try { + return AggType.valueOf(value); + } + catch (IllegalArgumentException e) { + throw new PaceException("Undefined aggregation type", e); + } + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java new file mode 100644 index 000000000..ea4ad7e9b --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java @@ -0,0 +1,9 @@ +package eu.dnetlib.pace.tree.support; + +import eu.dnetlib.pace.model.Field; + +public interface Comparator { + + public double compare(Field a, Field b); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java new file mode 100644 index 000000000..8c3002eb6 --- /dev/null +++
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java @@ -0,0 +1,13 @@ +package eu.dnetlib.pace.tree.support; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface ComparatorClass { + + public String value(); +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java similarity index 81% rename from dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java rename to dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java index 710bf10f8..3dd4b0185 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java @@ -1,4 +1,4 @@ -package eu.dnetlib.pace.model; +package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.util.PaceException; import org.codehaus.jackson.map.ObjectMapper; @@ -14,14 +14,25 @@ public class FieldConf implements Serializable { private double weight = 1.0; //weight for the field (to be used in the aggregation) private Map params; //parameters + private boolean ignoreMissing; + + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public void setIgnoreMissing(boolean ignoreMissing) { + this.ignoreMissing = ignoreMissing; + } + public FieldConf() { } - public FieldConf(String field, String comparator, double weight, Map params) { + public FieldConf(String field, String comparator, double weight, Map params, boolean ignoreMissing) { this.field = field; this.comparator = comparator; this.weight = weight; this.params = params; + this.ignoreMissing = ignoreMissing; } public String getField() { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java new file mode 100644 index 000000000..c16039587 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java @@ -0,0 +1,18 @@ +package eu.dnetlib.pace.tree.support; + +public enum MatchType { + + MATCH, + NO_MATCH, + UNDEFINED; + + public static MatchType parse(String value) { + + try { + return MatchType.valueOf(value); + } + catch (IllegalArgumentException e) { + return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable + } + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java new file mode 100644 index 000000000..4af9f8126 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -0,0 +1,157 @@ +package eu.dnetlib.pace.tree.support; + +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; +import org.codehaus.jackson.map.ObjectMapper; + +import java.io.IOException; +import java.io.Serializable; +import java.util.List; + +public class TreeNodeDef implements Serializable { + + private List fields; + private AggType aggregation; + + private double threshold; + + private String positive; + private String negative; + private String undefined; + + boolean ignoreMissing; + + public TreeNodeDef(List fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreMissing) { + this.fields = fields; + this.aggregation = aggregation; + this.threshold = threshold; + this.positive = positive; + this.negative = negative; + this.undefined = undefined; + this.ignoreMissing = ignoreMissing; + } + + public TreeNodeDef() { + } + + public double evaluate(MapDocument doc1, MapDocument doc2) { + + 
DescriptiveStatistics stats = new DescriptiveStatistics(); + double sumWeights = 0.0; //for the weighted mean + + int missCount = 0; //counter for the number of misses + + for (FieldConf fieldConf : fields) { + + double weight = fieldConf.getWeight(); + + double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())); + + if (result >= 0.0) { //if the field is not missing + stats.addValue(weight * result); + sumWeights += weight; //sum weights, to be used in case of weighted mean + } + else { //if the field is missing + missCount += 1; + if (!fieldConf.isIgnoreMissing()){ //if the miss has not to be ignored + stats.addValue(weight * 0); + sumWeights += weight; + } + } + } + + //global ignoremissing (if one of the field is missing, return undefined) + if (!ignoreMissing && missCount>0) { + return -1; + } + + switch (aggregation){ + + case AVG: + return stats.getMean(); + case SUM: + return stats.getSum(); + case MAX: + return stats.getMax(); + case MIN: + return stats.getMin(); + case WEIGHTED_MEAN: + return stats.getSum()/sumWeights; + default: + return 0.0; + } + + } + + private Comparator comparator(final FieldConf field){ + + return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams()); + } + + public List getFields() { + return fields; + } + + public void setFields(List fields) { + this.fields = fields; + } + + public AggType getAggregation() { + return aggregation; + } + + public void setAggregation(AggType aggregation) { + this.aggregation = aggregation; + } + + public double getThreshold() { + return threshold; + } + + public void setThreshold(double threshold) { + this.threshold = threshold; + } + + public String getPositive() { + return positive; + } + + public void setPositive(String positive) { + this.positive = positive; + } + + public String getNegative() { + return negative; + } + + public void setNegative(String negative) { + this.negative = negative; + } 
+ + public String getUndefined() { + return undefined; + } + + public void setUndefined(String undefined) { + this.undefined = undefined; + } + + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public void setIgnoreMissing(boolean ignoreMissing) { + this.ignoreMissing = ignoreMissing; + } + + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 01da9c227..b1348e144 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -4,8 +4,8 @@ import com.google.common.collect.Lists; import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.WfConfig; -import eu.dnetlib.pace.distance.PaceDocumentDistance; -import eu.dnetlib.pace.distance.eval.ScoreResult; +//import eu.dnetlib.pace.distance.PaceDocumentDistance; +import eu.dnetlib.pace.distance.PairwiseComparison; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocumentComparator; @@ -116,7 +116,7 @@ public class BlockProcessor { private void process(final Queue queue, final Reporter context) { - final PaceDocumentDistance algo = new PaceDocumentDistance(); +// final PaceDocumentDistance algo = new PaceDocumentDistance(); while (!queue.isEmpty()) { @@ -150,21 +150,23 @@ public class BlockProcessor { if (!idCurr.equals(idPivot) && (fieldCurr != null)) { - final ScoreResult sr = similarity(algo, pivot, curr); -// log.info(sr.toString()+"SCORE "+ sr.getScore()); - emitOutput(sr, idPivot, idCurr, context); - i++; + final PairwiseComparison pairwiseComparison = new 
PairwiseComparison(dedupConf); + + emitOutput(pairwiseComparison.compare(pivot, curr), idPivot, idCurr, context); + +// final ScoreResult sr = similarity(algo, pivot, curr); +//// log.info(sr.toString()+"SCORE "+ sr.getScore()); +// emitOutput(sr, idPivot, idCurr, context); +// i++; } } } } } - private void emitOutput(final ScoreResult sr, final String idPivot, final String idCurr, final Reporter context) { - final double d = sr.getScore(); - - if (d >= dedupConf.getWf().getThreshold()) { + private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { + if (result) { writeSimilarity(context, idPivot, idCurr); context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); } else { @@ -172,15 +174,6 @@ public class BlockProcessor { } } - private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) { - try { - return algo.between(a, b, dedupConf); - } catch(Throwable e) { - log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e); - throw new IllegalArgumentException(e); - } - } - private boolean mustSkip(final String idPivot) { return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index fbbc9d77b..45e011fdd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -1390,7 +1390,7 @@ public class DiffPatchMatch { } /** - * Compute the Levenshtein distance; the number of inserted, deleted or + * Compute the Levenshtein distance; the number of inserted, deleted or * substituted characters. * @param diffs List of Diff objects. * @return Number of changes. 
@@ -1655,7 +1655,7 @@ public class DiffPatchMatch { score_threshold = score; best_loc = j - 1; if (best_loc > loc) { - // When passing loc, don't exceed our current distance from loc. + // When passing loc, don't exceed our current distance from loc. start = Math.max(1, 2 * loc - best_loc); } else { // Already passed loc, downhill from here on in. diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index d9868b690..c3b16c83e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -7,6 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass; import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.tree.support.Comparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; import org.reflections.Reflections; import java.io.Serializable; @@ -19,11 +21,13 @@ public class PaceResolver implements Serializable { public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering"); public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition"); - public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.distance.algo"); + public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.compare.algo"); + public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree"); private final Map> clusteringFunctions; private final Map> conditionAlgos; private final Map> distanceAlgos; + private final Map> comparators; public PaceResolver() { @@ -38,6 +42,10 @@ public class PaceResolver implements Serializable { this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream() .filter(DistanceAlgo.class::isAssignableFrom) 
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class)cl)); + + this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream() + .filter(Comparator.class::isAssignableFrom) + .collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class)cl)); } public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { @@ -64,4 +72,12 @@ public class PaceResolver implements Serializable { } } + public Comparator getComparator(String name, Map params) throws PaceException { + try { + return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { + throw new PaceException(name + " not found ", e); + } + } + } diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st index 97aef2d23..100b023f8 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st @@ -16,9 +16,9 @@ "pace" : { "clustering" : [ ], - "strictConditions" : [ + "sufficientConditions" : [ ], - "conditions" : [ + "necessaryConditions" : [ ], "model" : [ ], diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf index fd4fbbe79..53768c07c 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf @@ -16,10 +16,10 @@ { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } ], - "strictConditions" : [ + 
"sufficientConditions" : [ { "name" : "exactMatch", "fields" : [ "gridid" ] } ], - "conditions" : [ + "necessaryConditions" : [ { "name" : "exactMatch", "fields" : [ "country" ] }, { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } ],