Implementation of the decision tree. It takes the place of the distance algos; necessaryConditions and sufficientConditions are still there. The model now contains only the path, type and name of each field. ignoreMissing is still in the model because it is used by the conditions.

This commit is contained in:
miconis 2019-08-09 10:08:34 +02:00
parent f2136e1024
commit a5c5d2f01b
52 changed files with 1517 additions and 357 deletions

View File

@ -9,7 +9,7 @@ import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
/** /**
* Abstract conditions needs a list of field names. * Abstract necessaryConditions needs a list of field names.
* *
* @author claudio * @author claudio
* *

View File

@ -6,7 +6,7 @@ import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
/** /**
* Allows to express general conditions to be satisfied or not between two Documents. * Allows to express general necessaryConditions to be satisfied or not between two Documents.
* *
* @author claudio * @author claudio
*/ */

View File

@ -6,6 +6,7 @@ import java.util.Map;
import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
/** /**
* Interface for PACE configuration bean. * Interface for PACE configuration bean.
@ -21,6 +22,9 @@ public interface Config {
*/ */
public List<FieldDef> model(); public List<FieldDef> model();
public Map<String, TreeNodeDef> decisionTree();
/** /**
* Field configuration definitions. * Field configuration definitions.
* *
@ -31,16 +35,16 @@ public interface Config {
/** /**
* Strict Pre-Condition definitions. * Strict Pre-Condition definitions.
* *
* @return the list of conditions * @return the list of necessaryConditions
*/ */
public List<ConditionAlgo> strictConditions(); public List<ConditionAlgo> sufficientConditions();
/** /**
* Pre-Condition definitions. * Pre-Condition definitions.
* *
* @return the list of conditions * @return the list of necessaryConditions
*/ */
public List<ConditionAlgo> conditions(); public List<ConditionAlgo> necessaryConditions();
/** /**
* Clusterings. * Clusterings.

View File

@ -8,6 +8,7 @@ import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.function.BiFunction; import java.util.function.BiFunction;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate; import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -114,6 +115,11 @@ public class DedupConfig implements Config, Serializable {
} }
} }
@Override
public Map<String, TreeNodeDef> decisionTree(){
return getPace().getDecisionTree();
}
@Override @Override
public List<FieldDef> model() { public List<FieldDef> model() {
return getPace().getModel(); return getPace().getModel();
@ -125,12 +131,12 @@ public class DedupConfig implements Config, Serializable {
} }
@Override @Override
public List<ConditionAlgo> strictConditions() { public List<ConditionAlgo> sufficientConditions() {
return getPace().getStrictConditionAlgos(); return getPace().getStrictConditionAlgos();
} }
@Override @Override
public List<ConditionAlgo> conditions() { public List<ConditionAlgo> necessaryConditions() {
return getPace().getConditionAlgos(); return getPace().getConditionAlgos();
} }

View File

@ -6,6 +6,7 @@ import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.CondDef;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceResolver; import eu.dnetlib.pace.util.PaceResolver;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
import org.codehaus.jackson.annotate.JsonIgnore; import org.codehaus.jackson.annotate.JsonIgnore;
@ -18,9 +19,12 @@ import java.util.stream.Collectors;
public class PaceConfig implements Serializable { public class PaceConfig implements Serializable {
private List<FieldDef> model; private List<FieldDef> model;
private List<CondDef> strictConditions;
private List<CondDef> conditions; private List<CondDef> sufficientConditions;
private List<CondDef> necessaryConditions;
private List<ClusteringDef> clustering; private List<ClusteringDef> clustering;
private Map<String, TreeNodeDef> decisionTree;
private Map<String, List<String>> blacklists; private Map<String, List<String>> blacklists;
@JsonIgnore @JsonIgnore
@ -46,30 +50,30 @@ public class PaceConfig implements Serializable {
this.model = model; this.model = model;
} }
public List<CondDef> getStrictConditions() { public List<CondDef> getSufficientConditions() {
return strictConditions; return sufficientConditions;
} }
public void setStrictConditions(final List<CondDef> strictConditions) { public void setSufficientConditions(final List<CondDef> sufficientConditions) {
this.strictConditions = strictConditions; this.sufficientConditions = sufficientConditions;
} }
public List<CondDef> getConditions() { public List<CondDef> getNecessaryConditions() {
return conditions; return necessaryConditions;
} }
@JsonIgnore @JsonIgnore
public List<ConditionAlgo> getConditionAlgos() { public List<ConditionAlgo> getConditionAlgos() {
return asConditionAlgos(getConditions()); return asConditionAlgos(getNecessaryConditions());
} }
@JsonIgnore @JsonIgnore
public List<ConditionAlgo> getStrictConditionAlgos() { public List<ConditionAlgo> getStrictConditionAlgos() {
return asConditionAlgos(getStrictConditions()); return asConditionAlgos(getSufficientConditions());
} }
public void setConditions(final List<CondDef> conditions) { public void setNecessaryConditions(final List<CondDef> necessaryConditions) {
this.conditions = conditions; this.necessaryConditions = necessaryConditions;
} }
public List<ClusteringDef> getClustering() { public List<ClusteringDef> getClustering() {
@ -80,6 +84,14 @@ public class PaceConfig implements Serializable {
this.clustering = clustering; this.clustering = clustering;
} }
public Map<String, TreeNodeDef> getDecisionTree() {
return decisionTree;
}
public void setDecisionTree(Map<String, TreeNodeDef> decisionTree) {
this.decisionTree = decisionTree;
}
public Map<String, List<String>> getBlacklists() { public Map<String, List<String>> getBlacklists() {
return blacklists; return blacklists;
} }

View File

@ -1,15 +1,15 @@
package eu.dnetlib.pace.distance; //package eu.dnetlib.pace.distance;
//
import eu.dnetlib.pace.config.Config; //import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.eval.ScoreResult; //import eu.dnetlib.pace.distance.eval.ScoreResult;
import eu.dnetlib.pace.model.Document; //import eu.dnetlib.pace.model.Document;
//
public abstract class AbstractDistance<A> implements Distance<A> { //public abstract class AbstractDistance<A> implements Distance<A> {
//
protected abstract Document toDocument(A a); // protected abstract Document toDocument(A a);
//
@Override // @Override
public ScoreResult between(final A a, final A b, final Config config) { // public boolean between(final A a, final A b, final Config config) {
return new DistanceScorer(config).distance(toDocument(a), toDocument(b)); // return new PairwiseComparison(config).compare(toDocument(a), toDocument(b));
} // }
} //}

View File

@ -1,9 +1,8 @@
package eu.dnetlib.pace.distance; package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.eval.ScoreResult;
public interface Distance<A> { public interface Distance<A> {
public ScoreResult between(A a, A b, Config config); public boolean between(A a, A b, Config config);
} }

View File

@ -5,7 +5,7 @@ import eu.dnetlib.pace.model.Field;
import java.util.Map; import java.util.Map;
/** /**
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two * Each field is configured with a compare algo which knows how to compute the compare (0-1) between the fields of two
* objects. * objects.
*/ */
public interface DistanceAlgo { public interface DistanceAlgo {

View File

@ -1,126 +0,0 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.distance.eval.DistanceEval;
import eu.dnetlib.pace.distance.eval.DistanceEvalMap;
import eu.dnetlib.pace.distance.eval.ScoreResult;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
/**
 * The distance between two documents is given by the weighted mean of the field distances.
 *
 * This class evaluates the configured strict conditions and conditions, computes one weighted
 * distance per field of the model, and collects everything into a {@link ScoreResult}.
 */
public class DistanceScorer {

    private static final Log log = LogFactory.getLog(DistanceScorer.class);

    private Config config;

    /**
     * @param config
     *            the configuration providing the model and the condition lists
     */
    public DistanceScorer(final Config config) {
        this.config = config;
    }

    /**
     * Compares two documents.
     *
     * @param a
     *            the first document
     * @param b
     *            the second document
     * @return the result holding the strict condition evaluations, the condition evaluations and the
     *         per-field distances
     */
    public ScoreResult distance(final Document a, final Document b) {
        final ScoreResult sr = new ScoreResult(); // to keep track of the result of the comparison

        sr.setStrictConditions(verify(a, b, config.strictConditions()));
        sr.setConditions(verify(a, b, config.conditions()));

        final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model()));
        for (final FieldDef fd : config.model()) {
            dMap.updateDistance(fieldDistance(a, b, fd));
        }
        sr.setDistances(dMap);

        return sr;
    }

    /**
     * Runs every condition on the pair of documents and merges the individual outcomes.
     *
     * @return the merged evaluation map
     */
    private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
        final ConditionEvalMap res = new ConditionEvalMap();

        // Deliberately no short-circuit: every condition is evaluated so that the merged map
        // reports the outcome of all of them, even after one has already failed.
        for (final ConditionAlgo cd : conditions) {
            res.mergeFrom(cd.verify(a, b));
        }
        return res;
    }

    /**
     * Computes the weighted distance of one field.
     *
     * A zero-weight field is returned without a distance being set. When either value is empty the
     * distance is -1 if the field ignores missing values, otherwise the full weight. In all other
     * cases the distance is the weight multiplied by the result of the field's distance algorithm.
     *
     * @throws PaceException
     *             if the two values are non-empty and have different types
     */
    private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) {
        final double w = fd.getWeight();
        final Field va = getValue(a, fd);
        final Field vb = getValue(b, fd);

        final DistanceEval de = new DistanceEval(fd, va, vb);

        // optimization for 0 weight
        if (w == 0) {
            return de;
        }

        if (va.isEmpty() || vb.isEmpty()) {
            if (fd.isIgnoreMissing()) {
                de.setDistance(-1);
            } else {
                de.setDistance(w);
            }
            return de;
        }

        if (!va.getType().equals(vb.getType())) {
            throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
        }
        de.setDistance(w * fd.distanceAlgo().distance(va, vb));
        return de;
    }

    /**
     * Reads the field from the document and, when the field definition has a positive length,
     * truncates its value(s) to that length. List values are additionally limited to the
     * configured size when that size is positive.
     *
     * Note: the truncation is applied in place on the field object returned by the document.
     */
    private Field getValue(final Document d, final FieldDef fd) {
        final Field v = d.values(fd.getName());
        if (fd.getLength() > 0) {

            if (v instanceof FieldValueImpl) {
                ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
            } else if (v instanceof FieldListImpl) {
                final List<String> original = ((FieldListImpl) v).stringList();
                // Collect into a fresh list before clearing, in case stringList() is backed by v.
                final List<String> truncated = original.stream()
                        .limit(fd.getSize() > 0 ? fd.getSize() : original.size())
                        .map(s -> StringUtils.substring(s, 0, fd.getLength()))
                        .collect(Collectors.toList());
                ((FieldListImpl) v).clear();
                // The strings are already limited and truncated: only wrap them as field values.
                ((FieldListImpl) v).addAll(truncated.stream()
                        .map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
                        .collect(Collectors.toList()));
            }
        }

        return v;
    }

    /**
     * @return the sum of the weights of the given field definitions
     */
    private double sumWeights(final Collection<FieldDef> fields) {
        double sum = 0.0;
        for (final FieldDef fd : fields) {
            sum += fd.getWeight();
        }
        return sum;
    }

}

View File

@ -1,12 +1,12 @@
package eu.dnetlib.pace.distance; //package eu.dnetlib.pace.distance;
//
import eu.dnetlib.pace.model.Document; //import eu.dnetlib.pace.model.Document;
//
public class PaceDocumentDistance extends AbstractDistance<Document> { //public class PaceDocumentDistance extends AbstractDistance<Document> {
//
@Override // @Override
protected Document toDocument(Document a) { // protected Document toDocument(Document a) {
return a; // return a;
} // }
//
} //}

View File

@ -0,0 +1,125 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.tree.support.MatchType;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException;
import eu.dnetlib.pace.util.Reporter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.List;
import java.util.Map;
/**
* The compare between two documents is given by the weighted mean of the field distances
*/
public class PairwiseComparison {
private static final Log log = LogFactory.getLog(PairwiseComparison.class);
private Config config;
public PairwiseComparison(final Config config) {
this.config = config;
}
public boolean compare(final MapDocument a, final MapDocument b) {
//verify sufficientConditions
if (verify(a, b, config.sufficientConditions()).result() > 0)
return true;
//verify necessaryConditions
if (verify(a, b, config.necessaryConditions()).result() < 0)
return false;
//evaluate the decision tree
return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH;
}
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
final ConditionEvalMap res = new ConditionEvalMap();
for (final ConditionAlgo cd : conditions) {
final ConditionEvalMap map = cd.verify(a, b);
res.mergeFrom(map);
// commented out shortcuts
/*
if (map.anyNegative()) {
return res;
}
*/
//if (strict && (res < 0)) return -1;
//cond += verify;
}
return res;
}
public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map<String, TreeNodeDef> decisionTree){
String current = "start";
double similarity;
while (MatchType.parse(current)==MatchType.UNDEFINED) {
TreeNodeDef currentNode = decisionTree.get(current);
//throw an exception if the node doesn't exist
if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + current);
similarity = currentNode.evaluate(doc1, doc2);
if (similarity == -1) {
current = currentNode.getUndefined();
}
else if (similarity>=currentNode.getThreshold()){
current = currentNode.getPositive();
}
else {
current = currentNode.getNegative();
}
}
return MatchType.parse(current);
}
// private Field getValue(final Document d, final FieldDef fd) {
// final Field v = d.values(fd.getName());
// if (fd.getLength() > 0) {
//
// if (v instanceof FieldValueImpl) {
// ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
// } else if (v instanceof FieldListImpl) {
// List<String> strings = ((FieldListImpl) v).stringList();
// strings = strings.stream()
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
// .collect(Collectors.toList());
// ((FieldListImpl) v).clear();
// ((FieldListImpl) v).addAll(strings.stream()
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
// .map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
// .collect(Collectors.toList()));
// }
// }
//
// return v;
// }
//
// private double sumWeights(final Collection<FieldDef> fields) {
// double sum = 0.0;
// for (final FieldDef fd : fields) {
// sum += fd.getWeight();
// }
// return sum;
// }
}

View File

@ -12,7 +12,7 @@ import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.FieldList;
/** /**
* For the rest of the fields delegate the distance measure to the second string library. * For the rest of the fields delegate the compare measure to the second string library.
*/ */
public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo { public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo {
@ -35,7 +35,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
} }
/** /**
* Instantiates a new second string distance algo. * Instantiates a new second string compare algo.
* *
* @param weight * @param weight
* the weight * the weight
@ -90,7 +90,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
* @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) * @see eu.dnetlib.pace.compare.DistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
*/ */
@Override @Override
public double distance(final Field a, final Field b) { public double distance(final Field a, final Field b) {

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.DistanceScorer;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;

View File

@ -7,7 +7,7 @@ import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map; import java.util.Map;
/** /**
* Compared distance between two titles, ignoring version numbers. Suitable for Software entities. * Compared compare between two titles, ignoring version numbers. Suitable for Software entities.
*/ */
@DistanceClass("LevensteinTitleIgnoreVersion") @DistanceClass("LevensteinTitleIgnoreVersion")
public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo { public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo {

View File

@ -7,7 +7,7 @@ import eu.dnetlib.pace.model.Field;
import java.util.Map; import java.util.Map;
/** /**
* Not all fields of a document need to partecipate in the distance measure. We model those fields as having a * Not all fields of a document need to partecipate in the compare measure. We model those fields as having a
* NullDistanceAlgo. * NullDistanceAlgo.
*/ */
@DistanceClass("Null") @DistanceClass("Null")

View File

@ -40,7 +40,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/ */
@Override @Override
public double getWeight() { public double getWeight() {
@ -50,7 +50,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/ */
@Override @Override
protected double normalize(final double d) { protected double normalize(final double d) {

View File

@ -40,7 +40,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/ */
@Override @Override
public double getWeight() { public double getWeight() {
@ -50,7 +50,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/ */
@Override @Override
protected double normalize(final double d) { protected double normalize(final double d) {

View File

@ -12,12 +12,12 @@ import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.FieldList;
/** /**
* For the rest of the fields delegate the distance measure to the second string library. * For the rest of the fields delegate the compare measure to the second string library.
*/ */
public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo { public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo {
/** /**
* Instantiates a new sorted second string distance algo. * Instantiates a new sorted second string compare algo.
* *
* @param weight * @param weight
* the weight * the weight
@ -35,7 +35,7 @@ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanc
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field) * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
*/ */
@Override @Override
protected List<String> toList(final Field list) { protected List<String> toList(final Field list) {

View File

@ -66,7 +66,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
*/ */
@Override @Override
public double distance(final Field a, final Field b) { public double distance(final Field a, final Field b) {
@ -79,7 +79,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/ */
@Override @Override
public double getWeight() { public double getWeight() {
@ -89,7 +89,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
/* /*
* (non-Javadoc) * (non-Javadoc)
* *
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/ */
@Override @Override
protected double normalize(final double d) { protected double normalize(final double d) {

View File

@ -1,32 +0,0 @@
package eu.dnetlib.pace.distance.eval;
import java.util.HashMap;
/**
 * Created by claudio on 10/03/16.
 *
 * Collects the per-field distance evaluations, indexed by field name, and keeps the running
 * totals needed to compute their weighted mean.
 */
public class DistanceEvalMap extends HashMap<String, DistanceEval> {

    private double sumWeights;

    private double sumDistances = 0.0;

    /**
     * @param sumWeights
     *            the initial total of the field weights
     */
    public DistanceEvalMap(final double sumWeights) {
        this.sumWeights = sumWeights;
    }

    /**
     * Registers the evaluation under its field name. A non-negative distance is added to the
     * running total; otherwise the weight of the field is removed from the total weight, so the
     * field does not take part in the mean.
     *
     * @param d
     *            the evaluation to register
     */
    public void updateDistance(final DistanceEval d) {
        put(d.getFieldDef().getName(), d);

        final boolean scored = d.getDistance() >= 0;
        if (!scored) {
            sumWeights -= d.getFieldDef().getWeight();
            return;
        }
        sumDistances += d.getDistance();
    }

    /**
     * @return the accumulated distances divided by the remaining total weight, or 0 when that
     *         total weight is 0
     */
    public double distance() {
        if (sumWeights == 0) {
            return 0;
        }
        return sumDistances / sumWeights;
    }

}

View File

@ -1,62 +0,0 @@
package eu.dnetlib.pace.distance.eval;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
/**
 * Created by claudio on 09/03/16.
 *
 * Holds the outcome of a comparison: the strict condition evaluations, the condition evaluations
 * and the per-field distances.
 */
public class ScoreResult implements Serializable {

    private ConditionEvalMap strictConditions;

    private ConditionEvalMap conditions;

    private DistanceEvalMap distances;

    /**
     * Computes the overall score. A positive strict-condition result yields 1.0; otherwise a
     * negative condition result yields 0.0; otherwise the score is the distance computed by the
     * distance map. A negative strict-condition result alone does not force the score to 0.0.
     *
     * @return the overall score
     */
    public double getScore() {
        final boolean strictlySatisfied = getStrictConditions().result() > 0;
        if (strictlySatisfied) {
            return 1.0;
        }

        final boolean rejected = getConditions().result() < 0;
        if (rejected) {
            return 0.0;
        }

        return getDistances().distance();
    }

    public ConditionEvalMap getStrictConditions() {
        return this.strictConditions;
    }

    public void setStrictConditions(final ConditionEvalMap strictConditions) {
        this.strictConditions = strictConditions;
    }

    public ConditionEvalMap getConditions() {
        return this.conditions;
    }

    public void setConditions(final ConditionEvalMap conditions) {
        this.conditions = conditions;
    }

    public DistanceEvalMap getDistances() {
        return this.distances;
    }

    public void setDistances(final DistanceEvalMap distances) {
        this.distances = distances;
    }

    /**
     * @return this object serialised by a Jackson {@link ObjectMapper}
     * @throws PaceException
     *             if the serialisation fails
     */
    @Override
    public String toString() {
        final ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("unable to serialise " + this.getClass().getName(), e);
        }
    }

}

View File

@ -14,25 +14,29 @@ import java.util.List;
import java.util.Map; import java.util.Map;
/** /**
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm. * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
*/ */
public class FieldDef implements Serializable { public class FieldDef implements Serializable {
public final static String PATH_SEPARATOR = "/"; public final static String PATH_SEPARATOR = "/";
private String algo;
private String name; private String name;
private String path; private String path;
private boolean ignoreMissing;
private Type type; private Type type;
private boolean overrideMatch; private boolean ignoreMissing;
private double weight; public boolean isIgnoreMissing() {
return ignoreMissing;
}
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
}
private boolean overrideMatch;
/** /**
* Sets maximum size for the repeatable fields in the model. -1 for unbounded size. * Sets maximum size for the repeatable fields in the model. -1 for unbounded size.
@ -74,20 +78,6 @@ public class FieldDef implements Serializable {
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath())); return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
} }
public DistanceAlgo distanceAlgo() {
if (params == null) {
params = new HashMap<>();
}
params.put("weight", getWeight());
return PaceConfig.resolver.getDistanceAlgo(getAlgo(), params);
}
public boolean isIgnoreMissing() {
return ignoreMissing;
}
public Type getType() { public Type getType() {
return type; return type;
} }
@ -104,23 +94,6 @@ public class FieldDef implements Serializable {
this.overrideMatch = overrideMatch; this.overrideMatch = overrideMatch;
} }
public double getWeight() {
return weight;
}
public void setWeight(final double weight) {
this.weight = weight;
}
public String getAlgo() {
return algo;
}
public void setAlgo(final String algo) {
this.algo = algo;
}
public int getSize() { public int getSize() {
return size; return size;
} }
@ -153,10 +126,6 @@ public class FieldDef implements Serializable {
this.path = path; this.path = path;
} }
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
}
@Override @Override
public String toString() { public String toString() {
return new Gson().toJson(this); return new Gson().toJson(this);

View File

@ -0,0 +1,42 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator registered as "alwaysMatch": every pair of strings gets the score 1.0.
 */
@ComparatorClass("alwaysMatch")
public class AlwaysMatch extends AbstractComparator {

    /**
     * @param params
     *            the comparator parameters, handed over to the parent class
     */
    public AlwaysMatch(final Map<String, Number> params){
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight
     *            the weight, handed over to the parent class
     */
    public AlwaysMatch(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight
     *            the weight, handed over to the parent class
     * @param ssalgo
     *            the string distance, handed over to the parent class
     */
    protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return always 1.0; the two strings are not inspected
     */
    @Override
    public double distance(final String a, final String b) {
        final double match = 1.0;
        return match;
    }

    /**
     * @return the weight held by the parent class
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * @return the given value, unchanged
     */
    @Override
    protected double normalize(final double d) {
        return d;
    }

}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator registered as "exactMatch": scores 1.0 when the two strings are equal and 0
 * otherwise.
 */
@ComparatorClass("exactMatch")
public class ExactMatch extends AbstractComparator {

    /**
     * @param params
     *            the comparator parameters, handed over to the parent class
     */
    public ExactMatch(Map<String, Number> params){
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight
     *            the weight, handed over to the parent class
     */
    public ExactMatch(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight
     *            the weight, handed over to the parent class
     * @param ssalgo
     *            the string distance, handed over to the parent class
     */
    protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Compares the strings with {@link String#equals(Object)}; {@code a} must not be null.
     *
     * @return 1.0 when {@code a} equals {@code b}, 0 otherwise
     */
    @Override
    public double distance(final String a, final String b) {
        if (a.equals(b)) {
            return 1.0;
        }
        return 0;
    }

    /**
     * @return the weight held by the parent class
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * @return the given value, unchanged
     */
    @Override
    protected double normalize(final double d) {
        return d;
    }

}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.io.Serializable;
import java.util.Map;
/**
 * Comparator registered as "jaroWinkler": scores two strings with the second-string
 * {@code com.wcohen.ss.JaroWinkler} algorithm after cleaning them up.
 */
@ComparatorClass("jaroWinkler")
public class JaroWinkler extends AbstractComparator {

    /**
     * @param params
     *            the comparator parameters, handed over to the parent class
     */
    public JaroWinkler(Map<String, Number> params){
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight
     *            the weight, handed over to the parent class
     */
    public JaroWinkler(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight
     *            the weight, handed over to the parent class
     * @param ssalgo
     *            the string distance, handed over to the parent class
     */
    protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Cleans up both strings, scores them with the configured string distance and normalizes the
     * score.
     *
     * @return the normalized score
     */
    @Override
    public double distance(String a, String b) {
        final String cleanedA = cleanup(a);
        final String cleanedB = cleanup(b);

        final double rawScore = ssalgo.score(cleanedA, cleanedB);
        return normalize(rawScore);
    }

    /**
     * @return the weight held by the parent class
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * @return the given value, unchanged
     */
    @Override
    protected double normalize(double d) {
        return d;
    }

}

View File

@ -0,0 +1,78 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;
/**
 * Comparator registered as "jaroWinklerNormalizedName". Both strings are cleaned up, normalized
 * and stripped of stop words; keywords and cities are then extracted from them. Only when the
 * cities are the same and the keyword comparison exceeds the threshold are the remaining strings
 * (keywords and cities removed) scored with the second-string JaroWinkler algorithm.
 *
 * Recognised parameters: "windowSize" (default 4) and "threshold" (default 0.5).
 */
@ComparatorClass("jaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends AbstractComparator {

    private static final int DEFAULT_WINDOW_SIZE = 4;

    private static final double DEFAULT_THRESHOLD = 0.5;

    /** Never null: the weight-based constructors fall back to an empty map (i.e. to the defaults). */
    private final Map<String, Number> params;

    /**
     * @param params
     *            the comparator parameters, handed over to the parent class and read by
     *            {@link #distance(String, String)}
     */
    public JaroWinklerNormalizedName(Map<String, Number> params){
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params != null ? params : java.util.Collections.<String, Number>emptyMap();
    }

    /**
     * @param weight
     *            the weight, handed over to the parent class
     */
    public JaroWinklerNormalizedName(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
        // Without this, distance() would fail with a NullPointerException when reading params.
        this.params = java.util.Collections.<String, Number>emptyMap();
    }

    /**
     * @param weight
     *            the weight, handed over to the parent class
     * @param ssalgo
     *            the string distance, handed over to the parent class
     */
    protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
        // Without this, distance() would fail with a NullPointerException when reading params.
        this.params = java.util.Collections.<String, Number>emptyMap();
    }

    /**
     * @return 1.0 when cities and keywords agree and nothing is left of both strings once they
     *         are removed; the normalized JaroWinkler score of the remainders when cities and
     *         keywords agree; 0.0 otherwise
     */
    @Override
    public double distance(String a, String b) {
        String ca = cleanup(a);
        String cb = cleanup(b);

        ca = normalize(ca);
        cb = normalize(cb);

        ca = filterAllStopWords(ca);
        cb = filterAllStopWords(cb);

        // Looked up once instead of once per extraction.
        final int windowSize = params.getOrDefault("windowSize", DEFAULT_WINDOW_SIZE).intValue();

        Set<String> keywords1 = getKeywords(ca, windowSize);
        Set<String> keywords2 = getKeywords(cb, windowSize);

        Set<String> cities1 = getCities(ca, windowSize);
        Set<String> cities2 = getCities(cb, windowSize);

        if (sameCity(cities1, cities2)
                && keywordsCompare(keywords1, keywords2) > params.getOrDefault("threshold", DEFAULT_THRESHOLD).doubleValue()) {

            ca = removeKeywords(ca, keywords1);
            ca = removeKeywords(ca, cities1);
            cb = removeKeywords(cb, keywords2);
            cb = removeKeywords(cb, cities2);

            // Nothing left to compare: the strings consisted of matching keywords and cities only.
            if (ca.isEmpty() && cb.isEmpty()) {
                return 1.0;
            }
            return normalize(ssalgo.score(ca, cb));
        }

        return 0.0;
    }

    /**
     * @return the weight held by the parent class
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * @return the given value, unchanged
     */
    @Override
    protected double normalize(double d) {
        return d;
    }

}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Title comparator backed by {@link com.wcohen.ss.JaroWinkler}.
 *
 * <p>Both titles go through {@code cleanup} before being scored; when {@code checkNumbers}
 * flags the pair the score is fixed to {@code 0.5}.
 */
@ComparatorClass("jaroWinklerTitle")
public class JaroWinklerTitle extends AbstractComparator {

    /**
     * Creates the comparator from its configuration parameters.
     *
     * @param params the parameters
     */
    public JaroWinklerTitle(Map<String, Number> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * Creates the comparator with the given weight.
     *
     * @param weight the weight of this comparator
     */
    public JaroWinklerTitle(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * Creates the comparator with the given weight and string distance.
     *
     * @param weight the weight of this comparator
     * @param ssalgo the string distance used for scoring
     */
    protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Scores two titles.
     *
     * @param a the first title
     * @param b the second title
     * @return {@code 0.5} when {@code checkNumbers} reports {@code true} for the cleaned titles
     *         (presumably: they carry different numbers - confirm against the helper),
     *         otherwise the string-distance score of the cleaned titles
     */
    @Override
    public double distance(String a, String b) {
        final String cleanA = cleanup(a);
        final String cleanB = cleanup(b);

        if (checkNumbers(cleanA, cleanB)) {
            return 0.5;
        }
        return normalize(ssalgo.score(cleanA, cleanB));
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return weight;
    }

    /**
     * Leaves the raw score untouched.
     *
     * @param d the raw score
     * @return {@code d}
     */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator backed by {@link com.wcohen.ss.Level2JaroWinkler}; the raw score is returned as is.
 *
 * <p>The library class is always referenced by its fully-qualified name because it shares its
 * simple name with this class.
 */
@ComparatorClass("level2JaroWinkler")
public class Level2JaroWinkler extends AbstractComparator {

    /**
     * Creates the comparator from its configuration parameters.
     *
     * @param params the parameters
     */
    public Level2JaroWinkler(Map<String, Number> params) {
        super(params, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Creates the comparator with the given weight.
     *
     * @param w the weight of this comparator
     */
    public Level2JaroWinkler(double w) {
        super(w, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Creates the comparator with the given weight and string distance.
     *
     * @param w      the weight of this comparator
     * @param ssalgo the string distance used for scoring
     */
    protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return weight;
    }

    /**
     * Leaves the raw score untouched.
     *
     * @param d the raw score
     * @return {@code d}
     */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Title comparator backed by {@link com.wcohen.ss.Level2JaroWinkler}.
 *
 * <p>Titles are passed through {@code cleanup} and {@code finalCleanup} before scoring; when
 * {@code checkNumbers} flags the pair the score is fixed to {@code 0.5}.
 */
@ComparatorClass("level2JaroWinklerTitle")
public class Level2JaroWinklerTitle extends AbstractComparator {

    /**
     * Creates the comparator from its configuration parameters.
     *
     * @param params the parameters
     */
    public Level2JaroWinklerTitle(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Creates the comparator with the given weight.
     *
     * @param w the weight of this comparator
     */
    public Level2JaroWinklerTitle(final double w) {
        super(w, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Creates the comparator with the given weight and string distance.
     *
     * @param w      the weight of this comparator
     * @param ssalgo the string distance used for scoring
     */
    protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /**
     * Scores two titles.
     *
     * @param a the first title
     * @param b the second title
     * @return {@code 0.5} when {@code checkNumbers} reports {@code true} for the cleaned titles,
     *         otherwise the raw string-distance score of the fully cleaned titles
     */
    @Override
    public double distance(final String a, final String b) {
        final String cleanA = cleanup(a);
        final String cleanB = cleanup(b);

        if (checkNumbers(cleanA, cleanB)) {
            return 0.5;
        }

        final String finalA = finalCleanup(cleanA);
        final String finalB = finalCleanup(cleanB);

        // the raw score is returned directly, normalize() is not involved here
        return ssalgo.score(finalA, finalB);
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return weight;
    }

    /**
     * Leaves the raw score untouched.
     *
     * @param d the raw score
     * @return {@code d}
     */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator backed by {@link com.wcohen.ss.Level2Levenstein}.
 *
 * <p>The raw score {@code d} is mapped to {@code 1 / (|d| + 1)^0.1}, i.e. a raw score of
 * {@code 0} yields {@code 1} and the result shrinks as {@code |d|} grows.
 */
@ComparatorClass("level2Levenstein")
public class Level2Levenstein extends AbstractComparator {

    /**
     * Creates the comparator from its configuration parameters.
     *
     * @param params the parameters
     */
    public Level2Levenstein(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Level2Levenstein());
    }

    /**
     * Creates the comparator with the given weight.
     *
     * @param w the weight of this comparator
     */
    public Level2Levenstein(double w) {
        super(w, new com.wcohen.ss.Level2Levenstein());
    }

    /**
     * Creates the comparator with the given weight and string distance.
     *
     * @param w      the weight of this comparator
     * @param ssalgo the string distance used for scoring
     */
    protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return weight;
    }

    /**
     * Maps a raw score to {@code 1 / (|d| + 1)^0.1}.
     *
     * @param d the raw score
     * @return the normalized score
     */
    @Override
    protected double normalize(double d) {
        final double shifted = Math.abs(d) + 1;
        return 1 / Math.pow(shifted, 0.1);
    }
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator backed by {@link com.wcohen.ss.Levenstein}.
 *
 * <p>The raw score {@code d} is mapped to {@code 1 / (|d| + 1)^0.1}, i.e. a raw score of
 * {@code 0} yields {@code 1} and the result shrinks as {@code |d|} grows.
 */
@ComparatorClass("levenstein")
public class Levenstein extends AbstractComparator {

    /**
     * Creates the comparator from its configuration parameters.
     *
     * @param params the parameters
     */
    public Levenstein(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Levenstein());
    }

    /**
     * Creates the comparator with the given weight.
     *
     * @param w the weight of this comparator
     */
    public Levenstein(double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    /**
     * Creates the comparator with the given weight and string distance.
     *
     * @param w      the weight of this comparator
     * @param ssalgo the string distance used for scoring
     */
    protected Levenstein(double w, AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return weight;
    }

    /**
     * Maps a raw score to {@code 1 / (|d| + 1)^0.1}.
     *
     * @param d the raw score
     * @return the normalized score
     */
    @Override
    protected double normalize(double d) {
        final double shifted = Math.abs(d) + 1;
        return 1 / Math.pow(shifted, 0.1);
    }
}

View File

@ -0,0 +1,59 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Map;
/**
 * Title comparator backed by {@link com.wcohen.ss.Levenstein}.
 *
 * <p>Titles are passed through {@code cleanup} and {@code finalCleanup}; the raw score is then
 * scaled by the length of the longer cleaned title, giving {@code 1 - |score| / maxLength}.
 */
@ComparatorClass("levensteinTitle")
public class LevensteinTitle extends AbstractComparator {

    private static final Log log = LogFactory.getLog(LevensteinTitle.class);

    /**
     * Creates the comparator from its configuration parameters.
     *
     * @param params the parameters
     */
    public LevensteinTitle(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Levenstein());
    }

    /**
     * Creates the comparator with the given weight.
     *
     * @param w the weight of this comparator
     */
    public LevensteinTitle(final double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    /**
     * Creates the comparator with the given weight and string distance.
     *
     * @param w      the weight of this comparator
     * @param ssalgo the string distance used for scoring
     */
    protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /**
     * Scores two titles.
     *
     * @param a the first title
     * @param b the second title
     * @return {@code 0.5} when {@code checkNumbers} reports {@code true} for the cleaned titles;
     *         {@code -1} (the "missing" marker also used by {@link AbstractComparator}) when both
     *         titles are empty after the final cleanup; otherwise {@code 1 - |score| / maxLength}
     */
    @Override
    public double distance(final String a, final String b) {
        final String ca = cleanup(a);
        final String cb = cleanup(b);

        final boolean check = checkNumbers(ca, cb);

        if (check) return 0.5;

        final String cca = finalCleanup(ca);
        final String ccb = finalCleanup(cb);

        // with two empty strings the length normalization below would divide by zero (NaN)
        if (cca.isEmpty() && ccb.isEmpty()) {
            return -1;
        }

        return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
    }

    /**
     * Scales a raw score by the length of the longer string.
     *
     * @param score the raw score
     * @param la    length of the first string
     * @param lb    length of the second string; at least one of the two lengths must be positive
     * @return {@code 1 - |score| / max(la, lb)}
     */
    private double normalize(final double score, final int la, final int lb) {
        return 1 - (Math.abs(score) / Math.max(la, lb));
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * Maps a raw score to {@code 1 / (|d| + 1)^0.1}. Not used by {@link #distance(String, String)},
     * which applies the length-based normalization instead.
     *
     * @param d the raw score
     * @return the normalized score
     */
    @Override
    protected double normalize(final double d) {
        return 1 / Math.pow(Math.abs(d) + 1, 0.1);
    }
}

View File

@ -0,0 +1,60 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Compares two titles while ignoring version numbers. Suitable for Software entities.
 *
 * <p>Digits and the pattern returned by {@code getRomans} are stripped from the cleaned titles
 * before they are scored with {@link com.wcohen.ss.Levenstein}; the raw score is then scaled by
 * the length of the longer title, giving {@code 1 - |score| / maxLength}.
 */
@ComparatorClass("levensteinTitleIgnoreVersion")
public class LevensteinTitleIgnoreVersion extends AbstractComparator {

    /**
     * Creates the comparator from its configuration parameters.
     *
     * @param params the parameters
     */
    public LevensteinTitleIgnoreVersion(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Levenstein());
    }

    /**
     * Creates the comparator with the given weight.
     *
     * @param w the weight of this comparator
     */
    public LevensteinTitleIgnoreVersion(final double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    /**
     * Creates the comparator with the given weight and string distance.
     *
     * @param w      the weight of this comparator
     * @param ssalgo the string distance used for scoring
     */
    protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /**
     * Scores two titles, ignoring digits and roman numerals.
     *
     * @param a the first title
     * @param b the second title
     * @return {@code -1} (the "missing" marker also used by {@link AbstractComparator}) when both
     *         titles are empty once stripped and cleaned, otherwise {@code 1 - |score| / maxLength}
     */
    @Override
    public double distance(final String a, final String b) {
        String ca = cleanup(a);
        String cb = cleanup(b);

        // drop arabic digits first, then whatever getRomans() matches for each title
        ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim();
        cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim();

        ca = filterAllStopWords(ca);
        cb = filterAllStopWords(cb);

        final String cca = finalCleanup(ca);
        final String ccb = finalCleanup(cb);

        // with two empty strings the length normalization below would divide by zero (NaN)
        if (cca.isEmpty() && ccb.isEmpty()) {
            return -1;
        }

        return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
    }

    /**
     * Scales a raw score by the length of the longer string.
     *
     * @param score the raw score
     * @param la    length of the first string
     * @param lb    length of the second string; at least one of the two lengths must be positive
     * @return {@code 1 - |score| / max(la, lb)}
     */
    private double normalize(final double score, final int la, final int lb) {
        return 1 - (Math.abs(score) / Math.max(la, lb));
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * Maps a raw score to {@code 1 / (|d| + 1)^0.1}. Not used by {@link #distance(String, String)},
     * which applies the length-based normalization instead.
     *
     * @param d the raw score
     * @return the normalized score
     */
    @Override
    protected double normalize(final double d) {
        return 1 / Math.pow(Math.abs(d) + 1, 0.1);
    }
}

View File

@ -0,0 +1,41 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator that is satisfied only when the two values differ: {@code 1.0} for different
 * strings, {@code 0} for equal ones.
 *
 * <p>The string distance handed to the super class is never consulted by
 * {@link #distance(String, String)}.
 */
@ComparatorClass("mustBeDifferent")
public class MustBeDifferent extends AbstractComparator {

    /**
     * Creates the comparator from its configuration parameters.
     *
     * @param params the parameters
     */
    public MustBeDifferent(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Levenstein());
    }

    /**
     * Creates the comparator with the given weight.
     *
     * @param weight the weight of this comparator
     */
    public MustBeDifferent(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * Creates the comparator with the given weight and string distance.
     *
     * @param weight the weight of this comparator
     * @param ssalgo the string distance handed to the super class
     */
    protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Checks that the two strings are different.
     *
     * @param a the first value
     * @param b the second value
     * @return {@code 0} when {@code a.equals(b)}, {@code 1.0} otherwise
     */
    @Override
    public double distance(final String a, final String b) {
        if (a.equals(b)) {
            return 0;
        }
        return 1.0;
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return weight;
    }

    /**
     * Leaves the raw score untouched.
     *
     * @param d the raw score
     * @return {@code d}
     */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Not all fields of a document need to participate in the compare measure. Those fields are
 * modeled with a NullDistanceAlgo, whose comparison always yields {@code 0}.
 */
@ComparatorClass("null")
public class NullDistanceAlgo implements Comparator {

    /**
     * Creates the comparator; the parameters are accepted for uniformity with the other
     * comparators and are not used.
     *
     * @param params ignored
     */
    public NullDistanceAlgo(Map<String, Number> params) {
        // nothing to configure
    }

    /**
     * @param a ignored
     * @param b ignored
     * @return always {@code 0}
     */
    @Override
    public double compare(Field a, Field b) {
        return 0.0;
    }
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Jaro-Winkler comparator built on {@link AbstractSortedComparator}, which sorts the values of
 * list fields before they are compared. The raw score is returned as is.
 */
@ComparatorClass("sortedJaroWinkler")
public class SortedJaroWinkler extends AbstractSortedComparator {

    /**
     * Instantiates a new sorted jaro winkler from its configuration parameters.
     *
     * @param params
     *            the parameters
     */
    public SortedJaroWinkler(Map<String,Number> params) {
        // same algorithm as the weight-based constructor: the identity normalize() below is
        // written for JaroWinkler scores, not for Levenstein ones
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * Instantiates a new sorted jaro winkler.
     *
     * @param weight
     *            the weight
     */
    public SortedJaroWinkler(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * Instantiates a new sorted jaro winkler.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the ssalgo
     */
    protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * Leaves the raw score untouched.
     *
     * @param d
     *            the raw score
     * @return {@code d}
     */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Level-2 Jaro-Winkler comparator built on {@link AbstractSortedComparator}, which sorts the
 * values of list fields before they are compared. The raw score is returned as is.
 */
@ComparatorClass("sortedLevel2JaroWinkler")
public class SortedLevel2JaroWinkler extends AbstractSortedComparator {

    /**
     * Instantiates a new sorted level-2 jaro winkler.
     *
     * @param weight
     *            the weight
     */
    public SortedLevel2JaroWinkler(final double weight) {
        super(weight, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Instantiates a new sorted level-2 jaro winkler from its configuration parameters.
     *
     * @param params
     *            the parameters
     */
    public SortedLevel2JaroWinkler(final Map<String, Number> params) {
        super(params, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Instantiates a new sorted level-2 jaro winkler.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the ssalgo
     */
    protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return weight;
    }

    /**
     * Leaves the raw score untouched.
     *
     * @param d
     *            the raw score
     * @return {@code d}
     */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,99 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang.StringUtils;
import java.util.Map;
/**
 * Levenstein comparator that only looks at the first {@link #limit} characters of each value.
 */
@ComparatorClass("subStringLevenstein")
public class SubStringLevenstein extends AbstractComparator {

    /** Number of leading characters of each value taken into account by {@link #compare}. */
    protected int limit;

    /**
     * Instantiates a new sub string levenstein.
     *
     * <p>NOTE(review): {@link #limit} keeps its default of 0 with this constructor, so
     * {@link #compare} would only ever see empty prefixes - confirm this constructor is not
     * relied upon.
     *
     * @param w
     *            the w
     */
    public SubStringLevenstein(final double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    /**
     * Instantiates a new sub string levenstein from its configuration parameters.
     *
     * @param params
     *            the parameters; must contain the numeric entry {@code "limit"}
     * @throws NullPointerException
     *             if {@code params} is {@code null} or has no {@code "limit"} entry
     */
    public SubStringLevenstein(Map<String, Number> params) {
        super(params, new com.wcohen.ss.Levenstein());
        // same exception type as a plain dereference, but the message names the missing parameter
        final Number configuredLimit = java.util.Objects
                .requireNonNull(params, "subStringLevenstein: params must not be null")
                .get("limit");
        this.limit = java.util.Objects
                .requireNonNull(configuredLimit, "subStringLevenstein: missing required parameter 'limit'")
                .intValue();
    }

    /**
     * Instantiates a new sub string levenstein.
     *
     * @param w
     *            the w
     * @param limit
     *            the limit
     */
    public SubStringLevenstein(final double w, final int limit) {
        super(w, new com.wcohen.ss.Levenstein());
        this.limit = limit;
    }

    /**
     * Instantiates a new sub string levenstein.
     *
     * @param w
     *            the w
     * @param limit
     *            the limit
     * @param ssalgo
     *            the ssalgo
     */
    protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
        this.limit = limit;
    }

    /**
     * Compares the leading {@link #limit} characters of two String fields.
     *
     * @param a
     *            the first field
     * @param b
     *            the second field
     * @return the distance between the two prefixes
     * @throws IllegalArgumentException
     *             if the two fields are not both of type String
     */
    @Override
    public double compare(final Field a, final Field b) {
        if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
            return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));

        throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
    }

    /**
     * @return the weight of this comparator
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * Maps a raw score to {@code 1 / (|d| + 1)^0.1}.
     *
     * @param d
     *            the raw score
     * @return the normalized score
     */
    @Override
    protected double normalize(final double d) {
        return 1 / Math.pow(Math.abs(d) + 1, 0.1);
    }
}

View File

@ -0,0 +1,60 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang.StringUtils;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
/**
 * Compares two URL fields: the hosts must match (ignoring case), after which the paths are
 * compared with the inherited {@link Levenstein} distance.
 *
 * <p>Required parameters: {@code host} and {@code path}, the weights given to the host match and
 * to the path distance respectively.
 */
@ComparatorClass("urlMatcher")
public class UrlMatcher extends Levenstein {

    /** Holds the {@code host} and {@code path} weights read by {@link #compare}. */
    private Map<String, Number> params;

    /**
     * Creates the matcher from its configuration parameters.
     *
     * @param params the parameters, expected to contain {@code host} and {@code path}
     */
    public UrlMatcher(Map<String, Number> params) {
        super(params);
        this.params = params;
    }

    /**
     * Creates the matcher with an explicit weight.
     *
     * @param weight the weight of this comparator
     * @param params the parameters, expected to contain {@code host} and {@code path}
     */
    public UrlMatcher(double weight, Map<String, Number> params) {
        super(weight);
        this.params = params;
    }

    /**
     * Replaces the parameters used by {@link #compare}.
     *
     * @param params the new parameters
     */
    public void setParams(Map<String, Number> params) {
        this.params = params;
    }

    /**
     * Compares the first values of the two fields as URLs.
     *
     * @param a the first field
     * @param b the second field
     * @return {@code 0.0} when the hosts differ; half of the host weight when at least one path is
     *         blank; otherwise {@code host + path * distance(pathA, pathB)}
     * @throws IllegalStateException if one of the values is not a valid URL
     */
    @Override
    public double compare(Field a, Field b) {
        final URL urlA = asUrl(getFirstValue(a));
        final URL urlB = asUrl(getFirstValue(b));

        if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
            return 0.0;
        }

        // primitives: there is no reason to box the two weights
        final double hostW = params.get("host").doubleValue();
        final double pathW = params.get("path").doubleValue();

        if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
            return hostW * 0.5;
        }

        return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
    }

    /**
     * Parses a URL.
     *
     * @param value the textual URL
     * @return the parsed URL
     * @throws IllegalStateException if {@code value} is malformed; the original
     *         {@link MalformedURLException} is kept as the cause
     */
    private URL asUrl(final String value) {
        try {
            return new URL(value);
        } catch (MalformedURLException e) {
            // should not happen as checked by pace typing
            throw new IllegalStateException("invalid URL: " + value, e);
        }
    }
}

View File

@ -0,0 +1,110 @@
package eu.dnetlib.pace.tree.support;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import java.util.List;
import java.util.Map;
/**
 * Base class for the comparators that delegate the scoring to a SecondString
 * {@link AbstractStringDistance} and then normalize the raw score.
 *
 * <p>A result of {@code -1} is used throughout to signal that one of the values is missing.
 */
public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator {

    /** The string distance the scoring is delegated to. */
    protected AbstractStringDistance ssalgo;

    /** The weight of this comparator. */
    protected double weight = 0.0;

    /** The configuration parameters; stored by the map-based constructor, not read in this class. */
    private Map<String, Number> params;

    /**
     * Creates a comparator from its configuration parameters; the weight is set to {@code 1.0}.
     *
     * @param params
     *            the parameters
     * @param ssalgo
     *            the string distance
     */
    protected AbstractComparator(Map<String, Number> params, final AbstractStringDistance ssalgo) {
        this.params = params;
        this.weight = 1.0;
        this.ssalgo = ssalgo;
    }

    /**
     * Creates a comparator with an explicit weight.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the string distance
     */
    protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
        this.ssalgo = ssalgo;
        this.weight = weight;
    }

    /**
     * Creates a comparator whose weight keeps the default value {@code 0.0}.
     *
     * @param ssalgo
     *            the string distance
     */
    protected AbstractComparator(final AbstractStringDistance ssalgo) {
        this.ssalgo = ssalgo;
    }

    /**
     * Turns the raw score of the string distance into the value returned to the caller.
     *
     * @param d
     *            the raw score
     * @return the normalized score
     */
    protected abstract double normalize(double d);

    /**
     * Scores two strings.
     *
     * @param a
     *            the first string
     * @param b
     *            the second string
     * @return {@code -1} when either string is empty, otherwise the normalized score
     */
    public double distance(final String a, final String b) {
        final boolean fieldMissing = a.isEmpty() || b.isEmpty();
        // -1 flags a missing field
        return fieldMissing ? -1 : normalize(ssalgo.score(a, b));
    }

    /**
     * Scores two lists of strings by concatenating each of them first.
     *
     * @param a
     *            the first list
     * @param b
     *            the second list
     * @return the score of the two concatenations
     */
    protected double distance(final List<String> a, final List<String> b) {
        final String joinedA = concat(a);
        final String joinedB = concat(b);
        return distance(joinedA, joinedB);
    }

    /**
     * Compares two fields of the same type.
     *
     * @param a
     *            the first field
     * @param b
     *            the second field
     * @return {@code -1} when either field is empty, otherwise the score of their values
     * @throws IllegalArgumentException
     *             if the fields are neither both of type String nor both of type List
     */
    @Override
    public double compare(final Field a, final Field b) {
        if (a.isEmpty() || b.isEmpty()) {
            return -1;
        }

        if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) {
            return distance(a.stringValue(), b.stringValue());
        }

        if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) {
            return distance(toList(a), toList(b));
        }

        throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
    }

    /**
     * Extracts the string values of a list field.
     *
     * @param list
     *            the field, which must be a {@link FieldList}
     * @return its string values
     */
    protected List<String> toList(final Field list) {
        final FieldList values = (FieldList) list;
        return values.stringList();
    }

    /**
     * @return the weight of this comparator
     */
    public double getWeight() {
        return weight;
    }
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
 * {@link AbstractComparator} that sorts the values of a list field before they are compared, so
 * that the order in which the values were provided does not influence the score.
 */
public abstract class AbstractSortedComparator extends AbstractComparator {

    /** Weight used when the configuration parameters do not provide one. */
    private static final double DEFAULT_WEIGHT = 1.0;

    /**
     * Instantiates a new sorted second string compare algo.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the ssalgo
     */
    protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Instantiates a new sorted second string compare algo from its configuration parameters.
     *
     * @param params
     *            the parameters; the optional entry {@code "weight"} sets the weight, which
     *            otherwise defaults to {@code 1.0}
     * @param ssalgo
     *            the ssalgo
     */
    protected AbstractSortedComparator(final Map<String, Number> params, final AbstractStringDistance ssalgo) {
        super(weightOf(params), ssalgo);
    }

    /**
     * Reads the weight from the parameters without failing when it is absent.
     *
     * @param params
     *            the parameters, possibly {@code null}
     * @return the configured weight, or {@code 1.0} when none is configured
     */
    private static double weightOf(final Map<String, Number> params) {
        final Number configured = (params == null) ? null : params.get("weight");
        return (configured == null) ? DEFAULT_WEIGHT : configured.doubleValue();
    }

    /**
     * Extracts the string values of a list field in their natural order.
     *
     * @param list
     *            the field, which must be a {@link FieldList}
     * @return a sorted copy of its string values
     */
    @Override
    protected List<String> toList(final Field list) {
        FieldList fl = (FieldList) list;
        // copy first: the list owned by the field must not be reordered
        List<String> values = Lists.newArrayList(fl.stringList());
        Collections.sort(values);
        return values;
    }
}

View File

@ -0,0 +1,22 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
public enum AggType {
WEIGHTED_MEAN,
AVG,
SUM,
MAX,
MIN;
public static AggType getEnum(String value) {
try {
return AggType.valueOf(value);
}
catch (IllegalArgumentException e) {
throw new PaceException("Undefined aggregation type", e);
}
}
}

View File

@ -0,0 +1,9 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.model.Field;
/**
 * A comparison between the values that two documents hold for the same field.
 */
public interface Comparator {

    /**
     * Compares two fields.
     *
     * @param a the first field
     * @param b the second field
     * @return the result of the comparison
     */
    double compare(Field a, Field b);
}

View File

@ -0,0 +1,13 @@
package eu.dnetlib.pace.tree.support;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Marks a type as a comparator and gives it the name it is registered under. The annotation is
 * retained at runtime so that it can be read reflectively.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ComparatorClass {

    /**
     * @return the name of the comparator
     */
    String value();
}

View File

@ -1,4 +1,4 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.map.ObjectMapper;
@ -14,14 +14,25 @@ public class FieldConf implements Serializable {
private double weight = 1.0; //weight for the field (to be used in the aggregation) private double weight = 1.0; //weight for the field (to be used in the aggregation)
private Map<String,Number> params; //parameters private Map<String,Number> params; //parameters
private boolean ignoreMissing;
public boolean isIgnoreMissing() {
return ignoreMissing;
}
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
}
public FieldConf() { public FieldConf() {
} }
public FieldConf(String field, String comparator, double weight, Map<String, Number> params) { public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean ignoreMissing) {
this.field = field; this.field = field;
this.comparator = comparator; this.comparator = comparator;
this.weight = weight; this.weight = weight;
this.params = params; this.params = params;
this.ignoreMissing = ignoreMissing;
} }
public String getField() { public String getField() {

View File

@ -0,0 +1,18 @@
package eu.dnetlib.pace.tree.support;
/**
 * Possible outcomes of a match decision.
 */
public enum MatchType {

    MATCH,
    NO_MATCH,
    UNDEFINED;

    /**
     * Parses a match type from its (exact, case-sensitive) name.
     *
     * @param value the name to parse, may be {@code null}
     * @return the matching constant, or {@link #UNDEFINED} when {@code value} is {@code null} or
     *         does not name a constant
     */
    public static MatchType parse(String value) {
        if (value == null) {
            // valueOf(null) throws NullPointerException, which the catch below does not cover
            return MatchType.UNDEFINED;
        }
        try {
            return MatchType.valueOf(value);
        }
        catch (IllegalArgumentException ignored) {
            return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
        }
    }
}

View File

@ -0,0 +1,157 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
/**
 * Definition of a node of the decision tree: the fields to compare, how the single results are
 * aggregated, and the configuration ({@code threshold}, {@code positive}, {@code negative},
 * {@code undefined}) that is stored here for whoever walks the tree.
 */
public class TreeNodeDef implements Serializable {

    /** The fields compared by this node, each with its comparator, weight and parameters. */
    private List<FieldConf> fields;

    /** How the per-field results are combined by {@link #evaluate}. */
    private AggType aggregation;

    /** Threshold of the node; stored only, {@link #evaluate} does not read it. */
    private double threshold;

    // presumably the next step for each outcome of the node - confirm against the tree processor
    private String positive;
    private String negative;
    private String undefined;

    /** When {@code false}, a single missing field makes {@link #evaluate} return {@code -1}. */
    boolean ignoreMissing;

    /**
     * Creates a fully configured node.
     *
     * @param fields        the fields to compare
     * @param aggregation   the aggregation applied to the results
     * @param threshold     the threshold of the node
     * @param positive      the value stored for the positive outcome
     * @param negative      the value stored for the negative outcome
     * @param undefined     the value stored for the undefined outcome
     * @param ignoreMissing whether missing fields are tolerated at node level
     */
    public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreMissing) {
        this.fields = fields;
        this.aggregation = aggregation;
        this.threshold = threshold;
        this.positive = positive;
        this.negative = negative;
        this.undefined = undefined;
        this.ignoreMissing = ignoreMissing;
    }

    /**
     * Creates an empty node, to be filled through the setters.
     */
    public TreeNodeDef() {
    }

    /**
     * Compares the configured fields of two documents and aggregates the results.
     *
     * <p>A negative comparator result marks the field as missing. A missing field contributes a
     * zero (and its weight) unless its own {@code ignoreMissing} flag is set, in which case it is
     * left out of the aggregation altogether.
     *
     * @param doc1 the first document
     * @param doc2 the second document
     * @return {@code -1} when at least one field is missing and the node does not ignore missing
     *         fields, otherwise the aggregated value
     */
    public double evaluate(MapDocument doc1, MapDocument doc2) {

        final DescriptiveStatistics weighted = new DescriptiveStatistics();
        double weightTotal = 0.0;   // denominator of the weighted mean
        int missing = 0;            // number of fields that could not be compared

        for (FieldConf conf : fields) {

            final double weight = conf.getWeight();

            final double outcome = comparator(conf).compare(
                    doc1.getFieldMap().get(conf.getField()),
                    doc2.getFieldMap().get(conf.getField()));

            final boolean present = outcome >= 0.0;

            if (!present) {
                missing += 1;
                if (conf.isIgnoreMissing()) {
                    // an ignored miss does not take part in the aggregation at all
                    continue;
                }
            }

            weighted.addValue(present ? weight * outcome : weight * 0);
            weightTotal += weight;
        }

        // node-level ignoreMissing: one missing field is enough to make the node undefined
        if (missing > 0 && !ignoreMissing) {
            return -1;
        }

        switch (aggregation) {
            case AVG:
                return weighted.getMean();
            case SUM:
                return weighted.getSum();
            case MAX:
                return weighted.getMax();
            case MIN:
                return weighted.getMin();
            case WEIGHTED_MEAN:
                // NOTE(review): weightTotal can still be 0.0 here (no field aggregated, or weights
                // summing to zero); the division then yields NaN or an infinity
                return weighted.getSum() / weightTotal;
            default:
                return 0.0;
        }
    }

    /**
     * Resolves the comparator configured for a field.
     *
     * @param field the field configuration
     * @return the comparator named by the configuration, built with its parameters
     */
    private Comparator comparator(final FieldConf field) {
        return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
    }

    /** @return the fields compared by this node */
    public List<FieldConf> getFields() {
        return fields;
    }

    /** @param fields the fields compared by this node */
    public void setFields(List<FieldConf> fields) {
        this.fields = fields;
    }

    /** @return the aggregation applied to the results */
    public AggType getAggregation() {
        return aggregation;
    }

    /** @param aggregation the aggregation applied to the results */
    public void setAggregation(AggType aggregation) {
        this.aggregation = aggregation;
    }

    /** @return the threshold of the node */
    public double getThreshold() {
        return threshold;
    }

    /** @param threshold the threshold of the node */
    public void setThreshold(double threshold) {
        this.threshold = threshold;
    }

    /** @return the value stored for the positive outcome */
    public String getPositive() {
        return positive;
    }

    /** @param positive the value stored for the positive outcome */
    public void setPositive(String positive) {
        this.positive = positive;
    }

    /** @return the value stored for the negative outcome */
    public String getNegative() {
        return negative;
    }

    /** @param negative the value stored for the negative outcome */
    public void setNegative(String negative) {
        this.negative = negative;
    }

    /** @return the value stored for the undefined outcome */
    public String getUndefined() {
        return undefined;
    }

    /** @param undefined the value stored for the undefined outcome */
    public void setUndefined(String undefined) {
        this.undefined = undefined;
    }

    /** @return whether missing fields are tolerated at node level */
    public boolean isIgnoreMissing() {
        return ignoreMissing;
    }

    /** @param ignoreMissing whether missing fields are tolerated at node level */
    public void setIgnoreMissing(boolean ignoreMissing) {
        this.ignoreMissing = ignoreMissing;
    }

    /**
     * Serializes the node as JSON.
     *
     * @return the JSON representation of this node
     * @throws PaceException if the serialization fails
     */
    @Override
    public String toString() {
        final ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Impossible to convert to JSON: ", e);
        }
    }
}

View File

@ -4,8 +4,8 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig; import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.distance.PaceDocumentDistance; //import eu.dnetlib.pace.distance.PaceDocumentDistance;
import eu.dnetlib.pace.distance.eval.ScoreResult; import eu.dnetlib.pace.distance.PairwiseComparison;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentComparator; import eu.dnetlib.pace.model.MapDocumentComparator;
@ -116,7 +116,7 @@ public class BlockProcessor {
private void process(final Queue<MapDocument> queue, final Reporter context) { private void process(final Queue<MapDocument> queue, final Reporter context) {
final PaceDocumentDistance algo = new PaceDocumentDistance(); // final PaceDocumentDistance algo = new PaceDocumentDistance();
while (!queue.isEmpty()) { while (!queue.isEmpty()) {
@ -150,21 +150,23 @@ public class BlockProcessor {
if (!idCurr.equals(idPivot) && (fieldCurr != null)) { if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
final ScoreResult sr = similarity(algo, pivot, curr); final PairwiseComparison pairwiseComparison = new PairwiseComparison(dedupConf);
// log.info(sr.toString()+"SCORE "+ sr.getScore());
emitOutput(sr, idPivot, idCurr, context); emitOutput(pairwiseComparison.compare(pivot, curr), idPivot, idCurr, context);
i++;
// final ScoreResult sr = similarity(algo, pivot, curr);
//// log.info(sr.toString()+"SCORE "+ sr.getScore());
// emitOutput(sr, idPivot, idCurr, context);
// i++;
} }
} }
} }
} }
} }
private void emitOutput(final ScoreResult sr, final String idPivot, final String idCurr, final Reporter context) { private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
final double d = sr.getScore();
if (d >= dedupConf.getWf().getThreshold()) {
if (result) {
writeSimilarity(context, idPivot, idCurr); writeSimilarity(context, idPivot, idCurr);
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
} else { } else {
@ -172,15 +174,6 @@ public class BlockProcessor {
} }
} }
private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) {
try {
return algo.between(a, b, dedupConf);
} catch(Throwable e) {
log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e);
throw new IllegalArgumentException(e);
}
}
private boolean mustSkip(final String idPivot) { private boolean mustSkip(final String idPivot) {
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
} }

View File

@ -1390,7 +1390,7 @@ public class DiffPatchMatch {
} }
/** /**
* Compute the Levenshtein distance; the number of inserted, deleted or * Compute the Levenshtein compare; the number of inserted, deleted or
* substituted characters. * substituted characters.
* @param diffs List of Diff objects. * @param diffs List of Diff objects.
* @return Number of changes. * @return Number of changes.
@ -1655,7 +1655,7 @@ public class DiffPatchMatch {
score_threshold = score; score_threshold = score;
best_loc = j - 1; best_loc = j - 1;
if (best_loc > loc) { if (best_loc > loc) {
// When passing loc, don't exceed our current distance from loc. // When passing loc, don't exceed our current compare from loc.
start = Math.max(1, 2 * loc - best_loc); start = Math.max(1, 2 * loc - best_loc);
} else { } else {
// Already passed loc, downhill from here on in. // Already passed loc, downhill from here on in.

View File

@ -7,6 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass;
import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.reflections.Reflections; import org.reflections.Reflections;
import java.io.Serializable; import java.io.Serializable;
@ -19,11 +21,13 @@ public class PaceResolver implements Serializable {
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering"); public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition"); public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition");
public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.distance.algo"); public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.compare.algo");
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
private final Map<String, Class<ClusteringFunction>> clusteringFunctions; private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
private final Map<String, Class<ConditionAlgo>> conditionAlgos; private final Map<String, Class<ConditionAlgo>> conditionAlgos;
private final Map<String, Class<DistanceAlgo>> distanceAlgos; private final Map<String, Class<DistanceAlgo>> distanceAlgos;
private final Map<String, Class<Comparator>> comparators;
public PaceResolver() { public PaceResolver() {
@ -38,6 +42,10 @@ public class PaceResolver implements Serializable {
this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream() this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream()
.filter(DistanceAlgo.class::isAssignableFrom) .filter(DistanceAlgo.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl)); .collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
.filter(Comparator.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl));
} }
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException { public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
@ -64,4 +72,12 @@ public class PaceResolver implements Serializable {
} }
} }
public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
try {
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
throw new PaceException(name + " not found ", e);
}
}
} }

View File

@ -16,9 +16,9 @@
"pace" : { "pace" : {
"clustering" : [ "clustering" : [
], ],
"strictConditions" : [ "sufficientConditions" : [
], ],
"conditions" : [ "necessaryConditions" : [
], ],
"model" : [ "model" : [
], ],

View File

@ -16,10 +16,10 @@
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
], ],
"strictConditions" : [ "sufficientConditions" : [
{ "name" : "exactMatch", "fields" : [ "gridid" ] } { "name" : "exactMatch", "fields" : [ "gridid" ] }
], ],
"conditions" : [ "necessaryConditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] }, { "name" : "exactMatch", "fields" : [ "country" ] },
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
], ],