forked from D-Net/dnet-hadoop
Implementation of the decision tree. It takes the place of the distance algos; necessaryConditions and sufficientConditions are still there. The model now contains only the path, type and name of each field. ignoreMissing is still in the model because it is used by the conditions.
This commit is contained in:
parent
f2136e1024
commit
a5c5d2f01b
|
@ -9,7 +9,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Abstract conditions needs a list of field names.
|
||||
* Abstract conditions need a list of field names.
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
|
|
|
@ -6,7 +6,7 @@ import eu.dnetlib.pace.model.Document;
|
|||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Allows to express general conditions to be satisfied or not between two Documents.
|
||||
* Allows expressing general conditions to be satisfied or not between two Documents.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Map;
|
|||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
|
||||
/**
|
||||
* Interface for PACE configuration bean.
|
||||
|
@ -21,6 +22,9 @@ public interface Config {
|
|||
*/
|
||||
public List<FieldDef> model();
|
||||
|
||||
|
||||
public Map<String, TreeNodeDef> decisionTree();
|
||||
|
||||
/**
|
||||
* Field configuration definitions.
|
||||
*
|
||||
|
@ -31,16 +35,16 @@ public interface Config {
|
|||
/**
|
||||
* Strict Pre-Condition definitions.
|
||||
*
|
||||
* @return the list of conditions
|
||||
* @return the list of sufficient conditions
|
||||
*/
|
||||
public List<ConditionAlgo> strictConditions();
|
||||
public List<ConditionAlgo> sufficientConditions();
|
||||
|
||||
/**
|
||||
* Pre-Condition definitions.
|
||||
*
|
||||
* @return the list of conditions
|
||||
* @return the list of necessary conditions
|
||||
*/
|
||||
public List<ConditionAlgo> conditions();
|
||||
public List<ConditionAlgo> necessaryConditions();
|
||||
|
||||
/**
|
||||
* Clusterings.
|
||||
|
|
|
@ -8,6 +8,7 @@ import java.util.Map;
|
|||
import java.util.Map.Entry;
|
||||
import java.util.function.BiFunction;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.antlr.stringtemplate.StringTemplate;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -114,6 +115,11 @@ public class DedupConfig implements Config, Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, TreeNodeDef> decisionTree(){
|
||||
return getPace().getDecisionTree();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<FieldDef> model() {
|
||||
return getPace().getModel();
|
||||
|
@ -125,12 +131,12 @@ public class DedupConfig implements Config, Serializable {
|
|||
}
|
||||
|
||||
@Override
|
||||
public List<ConditionAlgo> strictConditions() {
|
||||
public List<ConditionAlgo> sufficientConditions() {
|
||||
return getPace().getStrictConditionAlgos();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ConditionAlgo> conditions() {
|
||||
public List<ConditionAlgo> necessaryConditions() {
|
||||
return getPace().getConditionAlgos();
|
||||
}
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ import eu.dnetlib.pace.condition.ConditionAlgo;
|
|||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.CondDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceResolver;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
|
@ -18,9 +19,12 @@ import java.util.stream.Collectors;
|
|||
public class PaceConfig implements Serializable {
|
||||
|
||||
private List<FieldDef> model;
|
||||
private List<CondDef> strictConditions;
|
||||
private List<CondDef> conditions;
|
||||
|
||||
private List<CondDef> sufficientConditions;
|
||||
private List<CondDef> necessaryConditions;
|
||||
private List<ClusteringDef> clustering;
|
||||
private Map<String, TreeNodeDef> decisionTree;
|
||||
|
||||
private Map<String, List<String>> blacklists;
|
||||
|
||||
@JsonIgnore
|
||||
|
@ -46,30 +50,30 @@ public class PaceConfig implements Serializable {
|
|||
this.model = model;
|
||||
}
|
||||
|
||||
public List<CondDef> getStrictConditions() {
|
||||
return strictConditions;
|
||||
public List<CondDef> getSufficientConditions() {
|
||||
return sufficientConditions;
|
||||
}
|
||||
|
||||
public void setStrictConditions(final List<CondDef> strictConditions) {
|
||||
this.strictConditions = strictConditions;
|
||||
public void setSufficientConditions(final List<CondDef> sufficientConditions) {
|
||||
this.sufficientConditions = sufficientConditions;
|
||||
}
|
||||
|
||||
public List<CondDef> getConditions() {
|
||||
return conditions;
|
||||
public List<CondDef> getNecessaryConditions() {
|
||||
return necessaryConditions;
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public List<ConditionAlgo> getConditionAlgos() {
|
||||
return asConditionAlgos(getConditions());
|
||||
return asConditionAlgos(getNecessaryConditions());
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public List<ConditionAlgo> getStrictConditionAlgos() {
|
||||
return asConditionAlgos(getStrictConditions());
|
||||
return asConditionAlgos(getSufficientConditions());
|
||||
}
|
||||
|
||||
public void setConditions(final List<CondDef> conditions) {
|
||||
this.conditions = conditions;
|
||||
public void setNecessaryConditions(final List<CondDef> necessaryConditions) {
|
||||
this.necessaryConditions = necessaryConditions;
|
||||
}
|
||||
|
||||
public List<ClusteringDef> getClustering() {
|
||||
|
@ -80,6 +84,14 @@ public class PaceConfig implements Serializable {
|
|||
this.clustering = clustering;
|
||||
}
|
||||
|
||||
public Map<String, TreeNodeDef> getDecisionTree() {
|
||||
return decisionTree;
|
||||
}
|
||||
|
||||
public void setDecisionTree(Map<String, TreeNodeDef> decisionTree) {
|
||||
this.decisionTree = decisionTree;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getBlacklists() {
|
||||
return blacklists;
|
||||
}
|
||||
|
|
|
@ -1,15 +1,15 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
|
||||
public abstract class AbstractDistance<A> implements Distance<A> {
|
||||
|
||||
protected abstract Document toDocument(A a);
|
||||
|
||||
@Override
|
||||
public ScoreResult between(final A a, final A b, final Config config) {
|
||||
return new DistanceScorer(config).distance(toDocument(a), toDocument(b));
|
||||
}
|
||||
}
|
||||
//package eu.dnetlib.pace.distance;
|
||||
//
|
||||
//import eu.dnetlib.pace.config.Config;
|
||||
//import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
//import eu.dnetlib.pace.model.Document;
|
||||
//
|
||||
//public abstract class AbstractDistance<A> implements Distance<A> {
|
||||
//
|
||||
// protected abstract Document toDocument(A a);
|
||||
//
|
||||
// @Override
|
||||
// public boolean between(final A a, final A b, final Config config) {
|
||||
// return new PairwiseComparison(config).compare(toDocument(a), toDocument(b));
|
||||
// }
|
||||
//}
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
|
||||
public interface Distance<A> {
|
||||
|
||||
public ScoreResult between(A a, A b, Config config);
|
||||
public boolean between(A a, A b, Config config);
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
|
||||
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
|
||||
* objects.
|
||||
*/
|
||||
public interface DistanceAlgo {
|
||||
|
|
|
@ -1,126 +0,0 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.distance.eval.DistanceEval;
|
||||
import eu.dnetlib.pace.distance.eval.DistanceEvalMap;
|
||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* The distance between two documents is given by the weighted mean of the field distances
|
||||
*/
|
||||
public class DistanceScorer {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DistanceScorer.class);
|
||||
|
||||
private Config config;
|
||||
|
||||
public DistanceScorer(final Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public ScoreResult distance(final Document a, final Document b) {
|
||||
final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison
|
||||
|
||||
sr.setStrictConditions(verify(a, b, config.strictConditions()));
|
||||
sr.setConditions(verify(a, b, config.conditions()));
|
||||
|
||||
final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model()));
|
||||
|
||||
for (final FieldDef fd : config.model()) {
|
||||
|
||||
dMap.updateDistance(fieldDistance(a, b, fd));
|
||||
}
|
||||
sr.setDistances(dMap);
|
||||
return sr;
|
||||
}
|
||||
|
||||
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
|
||||
final ConditionEvalMap res = new ConditionEvalMap();
|
||||
|
||||
for (final ConditionAlgo cd : conditions) {
|
||||
final ConditionEvalMap map = cd.verify(a, b);
|
||||
res.mergeFrom(map);
|
||||
|
||||
// commented out shortcuts
|
||||
/*
|
||||
if (map.anyNegative()) {
|
||||
return res;
|
||||
}
|
||||
*/
|
||||
|
||||
//if (strict && (res < 0)) return -1;
|
||||
//cond += verify;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) {
|
||||
|
||||
final double w = fd.getWeight();
|
||||
final Field va = getValue(a, fd);
|
||||
final Field vb = getValue(b, fd);
|
||||
|
||||
final DistanceEval de = new DistanceEval(fd, va, vb);
|
||||
if ((w == 0)) return de; // optimization for 0 weight
|
||||
else {
|
||||
if (va.isEmpty() || vb.isEmpty()) {
|
||||
if (fd.isIgnoreMissing()) {
|
||||
de.setDistance(-1);
|
||||
} else {
|
||||
de.setDistance(w);
|
||||
}
|
||||
} else {
|
||||
if (va.getType().equals(vb.getType())) {
|
||||
de.setDistance(w * fd.distanceAlgo().distance(va, vb));
|
||||
} else {
|
||||
throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
||||
}
|
||||
}
|
||||
return de;
|
||||
}
|
||||
}
|
||||
|
||||
private Field getValue(final Document d, final FieldDef fd) {
|
||||
final Field v = d.values(fd.getName());
|
||||
if (fd.getLength() > 0) {
|
||||
|
||||
if (v instanceof FieldValueImpl) {
|
||||
((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
|
||||
} else if (v instanceof FieldListImpl) {
|
||||
List<String> strings = ((FieldListImpl) v).stringList();
|
||||
strings = strings.stream()
|
||||
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
.collect(Collectors.toList());
|
||||
((FieldListImpl) v).clear();
|
||||
((FieldListImpl) v).addAll(strings.stream()
|
||||
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
.map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
private double sumWeights(final Collection<FieldDef> fields) {
|
||||
double sum = 0.0;
|
||||
for (final FieldDef fd : fields) {
|
||||
sum += fd.getWeight();
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,12 +1,12 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
|
||||
public class PaceDocumentDistance extends AbstractDistance<Document> {
|
||||
|
||||
@Override
|
||||
protected Document toDocument(Document a) {
|
||||
return a;
|
||||
}
|
||||
|
||||
}
|
||||
//package eu.dnetlib.pace.distance;
|
||||
//
|
||||
//import eu.dnetlib.pace.model.Document;
|
||||
//
|
||||
//public class PaceDocumentDistance extends AbstractDistance<Document> {
|
||||
//
|
||||
// @Override
|
||||
// protected Document toDocument(Document a) {
|
||||
// return a;
|
||||
// }
|
||||
//
|
||||
//}
|
||||
|
|
|
@ -0,0 +1,125 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import eu.dnetlib.pace.tree.support.MatchType;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import eu.dnetlib.pace.util.Reporter;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The compare between two documents is given by the weighted mean of the field distances
|
||||
*/
|
||||
public class PairwiseComparison {
|
||||
|
||||
private static final Log log = LogFactory.getLog(PairwiseComparison.class);
|
||||
|
||||
private Config config;
|
||||
|
||||
public PairwiseComparison(final Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public boolean compare(final MapDocument a, final MapDocument b) {
|
||||
|
||||
//verify sufficientConditions
|
||||
if (verify(a, b, config.sufficientConditions()).result() > 0)
|
||||
return true;
|
||||
|
||||
//verify necessaryConditions
|
||||
if (verify(a, b, config.necessaryConditions()).result() < 0)
|
||||
return false;
|
||||
|
||||
//evaluate the decision tree
|
||||
return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH;
|
||||
}
|
||||
|
||||
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
|
||||
final ConditionEvalMap res = new ConditionEvalMap();
|
||||
|
||||
for (final ConditionAlgo cd : conditions) {
|
||||
final ConditionEvalMap map = cd.verify(a, b);
|
||||
res.mergeFrom(map);
|
||||
|
||||
// commented out shortcuts
|
||||
/*
|
||||
if (map.anyNegative()) {
|
||||
return res;
|
||||
}
|
||||
*/
|
||||
|
||||
//if (strict && (res < 0)) return -1;
|
||||
//cond += verify;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map<String, TreeNodeDef> decisionTree){
|
||||
|
||||
String current = "start";
|
||||
double similarity;
|
||||
|
||||
while (MatchType.parse(current)==MatchType.UNDEFINED) {
|
||||
|
||||
TreeNodeDef currentNode = decisionTree.get(current);
|
||||
//throw an exception if the node doesn't exist
|
||||
if (currentNode == null)
|
||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
||||
|
||||
similarity = currentNode.evaluate(doc1, doc2);
|
||||
|
||||
if (similarity == -1) {
|
||||
current = currentNode.getUndefined();
|
||||
}
|
||||
else if (similarity>=currentNode.getThreshold()){
|
||||
current = currentNode.getPositive();
|
||||
}
|
||||
else {
|
||||
current = currentNode.getNegative();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return MatchType.parse(current);
|
||||
}
|
||||
|
||||
// private Field getValue(final Document d, final FieldDef fd) {
|
||||
// final Field v = d.values(fd.getName());
|
||||
// if (fd.getLength() > 0) {
|
||||
//
|
||||
// if (v instanceof FieldValueImpl) {
|
||||
// ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
|
||||
// } else if (v instanceof FieldListImpl) {
|
||||
// List<String> strings = ((FieldListImpl) v).stringList();
|
||||
// strings = strings.stream()
|
||||
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
// .collect(Collectors.toList());
|
||||
// ((FieldListImpl) v).clear();
|
||||
// ((FieldListImpl) v).addAll(strings.stream()
|
||||
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
// .map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
|
||||
// .collect(Collectors.toList()));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return v;
|
||||
// }
|
||||
//
|
||||
// private double sumWeights(final Collection<FieldDef> fields) {
|
||||
// double sum = 0.0;
|
||||
// for (final FieldDef fd : fields) {
|
||||
// sum += fd.getWeight();
|
||||
// }
|
||||
// return sum;
|
||||
// }
|
||||
|
||||
}
|
|
@ -12,7 +12,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
/**
|
||||
* For the rest of the fields delegate the distance measure to the second string library.
|
||||
* For the rest of the fields delegate the distance measure to the second string library.
|
||||
*/
|
||||
public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo {
|
||||
|
||||
|
@ -35,7 +35,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
|||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new second string distance algo.
|
||||
* Instantiates a new second string distance algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
|
@ -90,7 +90,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final Field a, final Field b) {
|
||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.distance.algo;
|
|||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.DistanceScorer;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
|
|
@ -7,7 +7,7 @@ import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Compared distance between two titles, ignoring version numbers. Suitable for Software entities.
|
||||
* Compares two titles, ignoring version numbers. Suitable for Software entities.
|
||||
*/
|
||||
@DistanceClass("LevensteinTitleIgnoreVersion")
|
||||
public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo {
|
||||
|
|
|
@ -7,7 +7,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Not all fields of a document need to partecipate in the distance measure. We model those fields as having a
|
||||
* Not all fields of a document need to participate in the distance measure. We model those fields as having a
|
||||
* NullDistanceAlgo.
|
||||
*/
|
||||
@DistanceClass("Null")
|
||||
|
|
|
@ -40,7 +40,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
|
@ -50,7 +50,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
|
|
|
@ -40,7 +40,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
|
@ -50,7 +50,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
|
|
|
@ -12,12 +12,12 @@ import eu.dnetlib.pace.model.Field;
|
|||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
/**
|
||||
* For the rest of the fields delegate the distance measure to the second string library.
|
||||
* For the rest of the fields delegate the distance measure to the second string library.
|
||||
*/
|
||||
public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted second string distance algo.
|
||||
* Instantiates a new sorted second string distance algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
|
@ -35,7 +35,7 @@ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanc
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
protected List<String> toList(final Field list) {
|
||||
|
|
|
@ -66,7 +66,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final Field a, final Field b) {
|
||||
|
@ -79,7 +79,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
|
@ -89,7 +89,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
|
|
|
@ -1,32 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.eval;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Created by claudio on 10/03/16.
|
||||
*/
|
||||
public class DistanceEvalMap extends HashMap<String, DistanceEval> {
|
||||
|
||||
private double sumWeights;
|
||||
|
||||
private double sumDistances = 0.0;
|
||||
|
||||
public DistanceEvalMap(final double sumWeights) {
|
||||
this.sumWeights = sumWeights;
|
||||
}
|
||||
|
||||
public void updateDistance(final DistanceEval d) {
|
||||
|
||||
put(d.getFieldDef().getName(), d);
|
||||
if (d.getDistance() >= 0) {
|
||||
sumDistances += d.getDistance();
|
||||
} else {
|
||||
sumWeights -= d.getFieldDef().getWeight();
|
||||
}
|
||||
}
|
||||
|
||||
public double distance() {
|
||||
return sumWeights == 0 ? 0 : sumDistances / sumWeights;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,62 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.eval;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Created by claudio on 09/03/16.
|
||||
*/
|
||||
public class ScoreResult implements Serializable {
|
||||
|
||||
private ConditionEvalMap strictConditions;
|
||||
|
||||
private ConditionEvalMap conditions;
|
||||
|
||||
private DistanceEvalMap distances;
|
||||
|
||||
public double getScore() {
|
||||
|
||||
if (getStrictConditions().result() > 0) return 1.0;
|
||||
// if (getStrictConditions().result() < 0) return 0.0;
|
||||
if (getConditions().result() < 0) return 0.0;
|
||||
|
||||
return getDistances().distance();
|
||||
}
|
||||
|
||||
|
||||
public ConditionEvalMap getStrictConditions() {
|
||||
return strictConditions;
|
||||
}
|
||||
|
||||
public void setStrictConditions(final ConditionEvalMap strictConditions) {
|
||||
this.strictConditions = strictConditions;
|
||||
}
|
||||
|
||||
public ConditionEvalMap getConditions() {
|
||||
return conditions;
|
||||
}
|
||||
|
||||
public void setConditions(final ConditionEvalMap conditions) {
|
||||
this.conditions = conditions;
|
||||
}
|
||||
|
||||
public DistanceEvalMap getDistances() {
|
||||
return distances;
|
||||
}
|
||||
|
||||
public void setDistances(final DistanceEvalMap distances) {
|
||||
this.distances = distances;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("unable to serialise " + this.getClass().getName(), e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -14,25 +14,29 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm.
|
||||
* The schema is composed of field definitions (FieldDef). Each field has a type, a name and a path; how fields are compared is configured in the decision tree.
|
||||
*/
|
||||
public class FieldDef implements Serializable {
|
||||
|
||||
public final static String PATH_SEPARATOR = "/";
|
||||
|
||||
private String algo;
|
||||
|
||||
private String name;
|
||||
|
||||
private String path;
|
||||
|
||||
private boolean ignoreMissing;
|
||||
|
||||
private Type type;
|
||||
|
||||
private boolean overrideMatch;
|
||||
private boolean ignoreMissing;
|
||||
|
||||
private double weight;
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
private boolean overrideMatch;
|
||||
|
||||
/**
|
||||
* Sets maximum size for the repeatable fields in the model. -1 for unbounded size.
|
||||
|
@ -74,20 +78,6 @@ public class FieldDef implements Serializable {
|
|||
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
|
||||
}
|
||||
|
||||
public DistanceAlgo distanceAlgo() {
|
||||
|
||||
if (params == null) {
|
||||
params = new HashMap<>();
|
||||
}
|
||||
|
||||
params.put("weight", getWeight());
|
||||
return PaceConfig.resolver.getDistanceAlgo(getAlgo(), params);
|
||||
}
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
}
|
||||
|
||||
public Type getType() {
|
||||
return type;
|
||||
}
|
||||
|
@ -104,23 +94,6 @@ public class FieldDef implements Serializable {
|
|||
this.overrideMatch = overrideMatch;
|
||||
}
|
||||
|
||||
public double getWeight() {
|
||||
return weight;
|
||||
}
|
||||
|
||||
public void setWeight(final double weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
public String getAlgo() {
|
||||
return algo;
|
||||
}
|
||||
|
||||
public void setAlgo(final String algo) {
|
||||
this.algo = algo;
|
||||
}
|
||||
|
||||
|
||||
public int getSize() {
|
||||
return size;
|
||||
}
|
||||
|
@ -153,10 +126,6 @@ public class FieldDef implements Serializable {
|
|||
this.path = path;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("alwaysMatch")
|
||||
public class AlwaysMatch extends AbstractComparator {
|
||||
|
||||
public AlwaysMatch(final Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public AlwaysMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("exactMatch")
|
||||
public class ExactMatch extends AbstractComparator {
|
||||
|
||||
public ExactMatch(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public ExactMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@ComparatorClass("jaroWinkler")
|
||||
public class JaroWinkler extends AbstractComparator {
|
||||
|
||||
public JaroWinkler(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public JaroWinkler(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,78 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ComparatorClass("jaroWinklerNormalizedName")
|
||||
public class JaroWinklerNormalizedName extends AbstractComparator {
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
public JaroWinklerNormalizedName(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public JaroWinklerNormalizedName(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
if (sameCity(cities1,cities2)) {
|
||||
|
||||
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||
|
||||
ca = removeKeywords(ca, keywords1);
|
||||
ca = removeKeywords(ca, cities1);
|
||||
cb = removeKeywords(cb, keywords2);
|
||||
cb = removeKeywords(cb, cities2);
|
||||
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca,cb));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@ComparatorClass("jaroWinklerTitle")
|
||||
public class JaroWinklerTitle extends AbstractComparator {
|
||||
|
||||
public JaroWinklerTitle(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public JaroWinklerTitle(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
boolean check = checkNumbers(ca, cb);
|
||||
return check ? 0.5 : normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2JaroWinkler")
|
||||
public class Level2JaroWinkler extends AbstractComparator {
|
||||
|
||||
public Level2JaroWinkler(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public Level2JaroWinkler(double w) {
|
||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2JaroWinklerTitle")
|
||||
public class Level2JaroWinklerTitle extends AbstractComparator {
|
||||
|
||||
public Level2JaroWinklerTitle(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public Level2JaroWinklerTitle(final double w) {
|
||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return ssalgo.score(cca, ccb);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2Levenstein")
|
||||
public class Level2Levenstein extends AbstractComparator {
|
||||
|
||||
public Level2Levenstein(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Level2Levenstein());
|
||||
}
|
||||
|
||||
public Level2Levenstein(double w) {
|
||||
super(w, new com.wcohen.ss.Level2Levenstein());
|
||||
}
|
||||
|
||||
protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("levenstein")
|
||||
public class Levenstein extends AbstractComparator {
|
||||
|
||||
public Levenstein(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public Levenstein(double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected Levenstein(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("levensteinTitle")
|
||||
public class LevensteinTitle extends AbstractComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
||||
|
||||
public LevensteinTitle(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public LevensteinTitle(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
return 1 - (Math.abs(score) / Math.max(la, lb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Compared compare between two titles, ignoring version numbers. Suitable for Software entities.
|
||||
*/
|
||||
@ComparatorClass("levensteinTitleIgnoreVersion")
|
||||
public class LevensteinTitleIgnoreVersion extends AbstractComparator {
|
||||
|
||||
public LevensteinTitleIgnoreVersion(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public LevensteinTitleIgnoreVersion(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim();
|
||||
cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim();
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
return 1 - (Math.abs(score) / Math.max(la, lb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("mustBeDifferent")
|
||||
public class MustBeDifferent extends AbstractComparator {
|
||||
|
||||
public MustBeDifferent(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public MustBeDifferent(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
return !a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Not all fields of a document need to partecipate in the compare measure. We model those fields as having a
|
||||
* NullDistanceAlgo.
|
||||
*/
|
||||
@ComparatorClass("null")
|
||||
public class NullDistanceAlgo implements Comparator {
|
||||
|
||||
public NullDistanceAlgo(Map<String, Number> params){
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b) {
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@ComparatorClass("sortedJaroWinkler")
|
||||
public class SortedJaroWinkler extends AbstractSortedComparator {
|
||||
|
||||
public SortedJaroWinkler(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
*/
|
||||
public SortedJaroWinkler(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@ComparatorClass("sortedLevel2JaroWinkler")
|
||||
public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
*/
|
||||
public SortedLevel2JaroWinkler(final double weight) {
|
||||
super(weight, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public SortedLevel2JaroWinkler(final Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,99 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SubStringLevenstein.
|
||||
*/
|
||||
@ComparatorClass("subStringLevenstein")
|
||||
public class SubStringLevenstein extends AbstractComparator {
|
||||
|
||||
/** The limit. */
|
||||
protected int limit;
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
*/
|
||||
public SubStringLevenstein(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public SubStringLevenstein(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
this.limit = params.get("limit").intValue();
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
* @param limit
|
||||
* the limit
|
||||
*/
|
||||
public SubStringLevenstein(final double w, final int limit) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
* @param limit
|
||||
* the limit
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("urlMatcher")
|
||||
public class UrlMatcher extends Levenstein {
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
public UrlMatcher(Map<String, Number> params){
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public UrlMatcher(double weight, Map<String, Number> params) {
|
||||
super(weight);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public void setParams(Map<String, Number> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b) {
|
||||
|
||||
final URL urlA = asUrl(getFirstValue(a));
|
||||
final URL urlB = asUrl(getFirstValue(b));
|
||||
|
||||
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Double hostW = params.get("host").doubleValue();
|
||||
Double pathW = params.get("path").doubleValue();
|
||||
|
||||
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
|
||||
return hostW * 0.5;
|
||||
}
|
||||
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) {
|
||||
try {
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator {
|
||||
|
||||
/** The ssalgo. */
|
||||
protected AbstractStringDistance ssalgo;
|
||||
|
||||
/** The weight. */
|
||||
protected double weight = 0.0;
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
protected AbstractComparator(Map<String, Number> params, final AbstractStringDistance ssalgo){
|
||||
this.params = params;
|
||||
this.weight = 1.0;
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
this.ssalgo = ssalgo;
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
protected AbstractComparator(final AbstractStringDistance ssalgo){
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize.
|
||||
*
|
||||
* @param d
|
||||
* the d
|
||||
* @return the double
|
||||
*/
|
||||
protected abstract double normalize(double d);
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
public double distance(final String a, final String b) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1; //return -1 if a field is missing
|
||||
}
|
||||
double score = ssalgo.score(a, b);
|
||||
return normalize(score);
|
||||
}
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
protected double distance(final List<String> a, final List<String> b) {
|
||||
return distance(concat(a), concat(b));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue());
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b));
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* To list.
|
||||
*
|
||||
* @param list
|
||||
* the list
|
||||
* @return the list
|
||||
*/
|
||||
protected List<String> toList(final Field list) {
|
||||
return ((FieldList) list).stringList();
|
||||
}
|
||||
|
||||
public double getWeight(){
|
||||
return this.weight;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class AbstractSortedComparator extends AbstractComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
protected AbstractSortedComparator(final Map<String, Number> params, final AbstractStringDistance ssalgo){
|
||||
super(params.get("weight").doubleValue(), ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> toList(final Field list) {
|
||||
FieldList fl = (FieldList) list;
|
||||
List<String> values = Lists.newArrayList(fl.stringList());
|
||||
Collections.sort(values);
|
||||
return values;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public enum AggType {
|
||||
|
||||
WEIGHTED_MEAN,
|
||||
AVG,
|
||||
SUM,
|
||||
MAX,
|
||||
MIN;
|
||||
|
||||
public static AggType getEnum(String value) {
|
||||
|
||||
try {
|
||||
return AggType.valueOf(value);
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
throw new PaceException("Undefined aggregation type", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
public interface Comparator {
|
||||
|
||||
public double compare(Field a, Field b);
|
||||
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Type-level annotation, retained at runtime, that attaches a string name to a class.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ComparatorClass {

    /**
     * @return the name attached to the annotated class
     */
    String value();
}
|
|
@ -1,4 +1,4 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
@ -14,14 +14,25 @@ public class FieldConf implements Serializable {
|
|||
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
||||
private Map<String,Number> params; //parameters
|
||||
|
||||
private boolean ignoreMissing;
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
public FieldConf() {
|
||||
}
|
||||
|
||||
public FieldConf(String field, String comparator, double weight, Map<String, Number> params) {
|
||||
public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean ignoreMissing) {
|
||||
this.field = field;
|
||||
this.comparator = comparator;
|
||||
this.weight = weight;
|
||||
this.params = params;
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
public String getField() {
|
|
@ -0,0 +1,18 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
/**
 * Match outcomes, with a lenient parser for their textual names.
 */
public enum MatchType {

    MATCH,
    NO_MATCH,
    UNDEFINED;

    /**
     * Parses a match type from its exact constant name.
     *
     * @param value the constant name; may be {@code null}
     * @return the matching constant, or {@link #UNDEFINED} when {@code value} is {@code null} or
     *         names no constant
     */
    public static MatchType parse(String value) {

        // valueOf(null) throws NullPointerException rather than IllegalArgumentException, so a
        // null value has to be handled explicitly to honour the "unparsable -> UNDEFINED" rule.
        if (value == null) {
            return MatchType.UNDEFINED;
        }

        try {
            return MatchType.valueOf(value);
        } catch (IllegalArgumentException e) {
            return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
        }
    }
}
|
|
@ -0,0 +1,157 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class TreeNodeDef implements Serializable {
|
||||
|
||||
private List<FieldConf> fields;
|
||||
private AggType aggregation;
|
||||
|
||||
private double threshold;
|
||||
|
||||
private String positive;
|
||||
private String negative;
|
||||
private String undefined;
|
||||
|
||||
boolean ignoreMissing;
|
||||
|
||||
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreMissing) {
|
||||
this.fields = fields;
|
||||
this.aggregation = aggregation;
|
||||
this.threshold = threshold;
|
||||
this.positive = positive;
|
||||
this.negative = negative;
|
||||
this.undefined = undefined;
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
public TreeNodeDef() {
|
||||
}
|
||||
|
||||
public double evaluate(MapDocument doc1, MapDocument doc2) {
|
||||
|
||||
DescriptiveStatistics stats = new DescriptiveStatistics();
|
||||
double sumWeights = 0.0; //for the weighted mean
|
||||
|
||||
int missCount = 0; //counter for the number of misses
|
||||
|
||||
for (FieldConf fieldConf : fields) {
|
||||
|
||||
double weight = fieldConf.getWeight();
|
||||
|
||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
|
||||
|
||||
if (result >= 0.0) { //if the field is not missing
|
||||
stats.addValue(weight * result);
|
||||
sumWeights += weight; //sum weights, to be used in case of weighted mean
|
||||
}
|
||||
else { //if the field is missing
|
||||
missCount += 1;
|
||||
if (!fieldConf.isIgnoreMissing()){ //if the miss has not to be ignored
|
||||
stats.addValue(weight * 0);
|
||||
sumWeights += weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//global ignoremissing (if one of the field is missing, return undefined)
|
||||
if (!ignoreMissing && missCount>0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
switch (aggregation){
|
||||
|
||||
case AVG:
|
||||
return stats.getMean();
|
||||
case SUM:
|
||||
return stats.getSum();
|
||||
case MAX:
|
||||
return stats.getMax();
|
||||
case MIN:
|
||||
return stats.getMin();
|
||||
case WEIGHTED_MEAN:
|
||||
return stats.getSum()/sumWeights;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private Comparator comparator(final FieldConf field){
|
||||
|
||||
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
|
||||
}
|
||||
|
||||
public List<FieldConf> getFields() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
public void setFields(List<FieldConf> fields) {
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
public AggType getAggregation() {
|
||||
return aggregation;
|
||||
}
|
||||
|
||||
public void setAggregation(AggType aggregation) {
|
||||
this.aggregation = aggregation;
|
||||
}
|
||||
|
||||
public double getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
|
||||
public void setThreshold(double threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
public String getPositive() {
|
||||
return positive;
|
||||
}
|
||||
|
||||
public void setPositive(String positive) {
|
||||
this.positive = positive;
|
||||
}
|
||||
|
||||
public String getNegative() {
|
||||
return negative;
|
||||
}
|
||||
|
||||
public void setNegative(String negative) {
|
||||
this.negative = negative;
|
||||
}
|
||||
|
||||
public String getUndefined() {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
public void setUndefined(String undefined) {
|
||||
this.undefined = undefined;
|
||||
}
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -4,8 +4,8 @@ import com.google.common.collect.Lists;
|
|||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.WfConfig;
|
||||
import eu.dnetlib.pace.distance.PaceDocumentDistance;
|
||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
|
||||
import eu.dnetlib.pace.distance.PairwiseComparison;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.MapDocumentComparator;
|
||||
|
@ -116,7 +116,7 @@ public class BlockProcessor {
|
|||
|
||||
private void process(final Queue<MapDocument> queue, final Reporter context) {
|
||||
|
||||
final PaceDocumentDistance algo = new PaceDocumentDistance();
|
||||
// final PaceDocumentDistance algo = new PaceDocumentDistance();
|
||||
|
||||
while (!queue.isEmpty()) {
|
||||
|
||||
|
@ -150,21 +150,23 @@ public class BlockProcessor {
|
|||
|
||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||
|
||||
final ScoreResult sr = similarity(algo, pivot, curr);
|
||||
// log.info(sr.toString()+"SCORE "+ sr.getScore());
|
||||
emitOutput(sr, idPivot, idCurr, context);
|
||||
i++;
|
||||
final PairwiseComparison pairwiseComparison = new PairwiseComparison(dedupConf);
|
||||
|
||||
emitOutput(pairwiseComparison.compare(pivot, curr), idPivot, idCurr, context);
|
||||
|
||||
// final ScoreResult sr = similarity(algo, pivot, curr);
|
||||
//// log.info(sr.toString()+"SCORE "+ sr.getScore());
|
||||
// emitOutput(sr, idPivot, idCurr, context);
|
||||
// i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void emitOutput(final ScoreResult sr, final String idPivot, final String idCurr, final Reporter context) {
|
||||
final double d = sr.getScore();
|
||||
|
||||
if (d >= dedupConf.getWf().getThreshold()) {
|
||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||
|
||||
if (result) {
|
||||
writeSimilarity(context, idPivot, idCurr);
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
|
||||
} else {
|
||||
|
@ -172,15 +174,6 @@ public class BlockProcessor {
|
|||
}
|
||||
}
|
||||
|
||||
private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) {
|
||||
try {
|
||||
return algo.between(a, b, dedupConf);
|
||||
} catch(Throwable e) {
|
||||
log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e);
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tells whether the given pivot identifier has to be skipped.
 * The answer is true when the skip list of the workflow configuration contains the value
 * returned by getNsPrefix(idPivot) (presumably the namespace prefix of the id - confirm).
 *
 * @param idPivot identifier of the pivot document
 * @return true if the skip list contains the prefix derived from idPivot
 */
private boolean mustSkip(final String idPivot) {
|
||||
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
|
||||
}
|
||||
|
|
|
@ -1390,7 +1390,7 @@ public class DiffPatchMatch {
|
|||
}
|
||||
|
||||
/**
|
||||
* Compute the Levenshtein distance; the number of inserted, deleted or
|
||||
* Compute the Levenshtein distance; the number of inserted, deleted or
|
||||
* substituted characters.
|
||||
* @param diffs List of Diff objects.
|
||||
* @return Number of changes.
|
||||
|
@ -1655,7 +1655,7 @@ public class DiffPatchMatch {
|
|||
score_threshold = score;
|
||||
best_loc = j - 1;
|
||||
if (best_loc > loc) {
|
||||
// When passing loc, don't exceed our current distance from loc.
|
||||
// When passing loc, don't exceed our current distance from loc.
|
||||
start = Math.max(1, 2 * loc - best_loc);
|
||||
} else {
|
||||
// Already passed loc, downhill from here on in.
|
||||
|
|
|
@ -7,6 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass;
|
|||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.reflections.Reflections;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -19,11 +21,13 @@ public class PaceResolver implements Serializable {
|
|||
|
||||
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
|
||||
public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition");
|
||||
public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.distance.algo");
|
||||
// Scans the package of the @DistanceClass implementations. The distance classes are imported
// from eu.dnetlib.pace.distance, so the prefix must stay "distance.algo".
// NOTE(review): "compare.algo" looked like a side effect of a distance->compare text replacement - confirm.
public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.distance.algo");
|
||||
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
|
||||
|
||||
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
|
||||
private final Map<String, Class<ConditionAlgo>> conditionAlgos;
|
||||
private final Map<String, Class<DistanceAlgo>> distanceAlgos;
|
||||
private final Map<String, Class<Comparator>> comparators;
|
||||
|
||||
public PaceResolver() {
|
||||
|
||||
|
@ -38,6 +42,10 @@ public class PaceResolver implements Serializable {
|
|||
this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream()
|
||||
.filter(DistanceAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
||||
|
||||
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
|
||||
.filter(Comparator.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl));
|
||||
}
|
||||
|
||||
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
|
||||
|
@ -64,4 +72,12 @@ public class PaceResolver implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
|
||||
try {
|
||||
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -16,9 +16,9 @@
|
|||
"pace" : {
|
||||
"clustering" : [
|
||||
],
|
||||
"strictConditions" : [
|
||||
"sufficientConditions" : [
|
||||
],
|
||||
"conditions" : [
|
||||
"necessaryConditions" : [
|
||||
],
|
||||
"model" : [
|
||||
],
|
||||
|
|
|
@ -16,10 +16,10 @@
|
|||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
|
||||
],
|
||||
"strictConditions" : [
|
||||
"sufficientConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
|
||||
],
|
||||
"conditions" : [
|
||||
"necessaryConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] },
|
||||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
|
||||
],
|
||||
|
|
Loading…
Reference in New Issue