master #59
|
@ -9,7 +9,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Abstract conditions needs a list of field names.
|
||||
* Abstract conditions need a list of field names.
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
|
|
|
@ -6,7 +6,7 @@ import eu.dnetlib.pace.model.Document;
|
|||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Allows to express general conditions to be satisfied or not between two Documents.
|
||||
* Expresses general conditions that may or may not be satisfied between two Documents.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Map;
|
|||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
|
||||
/**
|
||||
* Interface for PACE configuration bean.
|
||||
|
@ -21,6 +22,9 @@ public interface Config {
|
|||
*/
|
||||
public List<FieldDef> model();
|
||||
|
||||
|
||||
/**
 * Decision tree definition.
 *
 * @return the tree node definitions, keyed by node name
 */
public Map<String, TreeNodeDef> decisionTree();
|
||||
|
||||
/**
|
||||
* Field configuration definitions.
|
||||
*
|
||||
|
@ -31,16 +35,16 @@ public interface Config {
|
|||
/**
|
||||
* Strict Pre-Condition definitions.
|
||||
*
|
||||
* @return the list of conditions
|
||||
* @return the list of sufficient conditions
|
||||
*/
|
||||
public List<ConditionAlgo> strictConditions();
|
||||
public List<ConditionAlgo> sufficientConditions();
|
||||
|
||||
/**
|
||||
* Pre-Condition definitions.
|
||||
*
|
||||
* @return the list of conditions
|
||||
* @return the list of necessary conditions
|
||||
*/
|
||||
public List<ConditionAlgo> conditions();
|
||||
public List<ConditionAlgo> necessaryConditions();
|
||||
|
||||
/**
|
||||
* Clusterings.
|
||||
|
|
|
@ -8,6 +8,7 @@ import java.util.Map;
|
|||
import java.util.Map.Entry;
|
||||
import java.util.function.BiFunction;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.antlr.stringtemplate.StringTemplate;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -114,6 +115,11 @@ public class DedupConfig implements Config, Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, TreeNodeDef> decisionTree(){
|
||||
return getPace().getDecisionTree();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<FieldDef> model() {
|
||||
return getPace().getModel();
|
||||
|
@ -125,12 +131,12 @@ public class DedupConfig implements Config, Serializable {
|
|||
}
|
||||
|
||||
@Override
|
||||
public List<ConditionAlgo> strictConditions() {
|
||||
public List<ConditionAlgo> sufficientConditions() {
|
||||
return getPace().getStrictConditionAlgos();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ConditionAlgo> conditions() {
|
||||
public List<ConditionAlgo> necessaryConditions() {
|
||||
return getPace().getConditionAlgos();
|
||||
}
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ import eu.dnetlib.pace.condition.ConditionAlgo;
|
|||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.CondDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceResolver;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
|
@ -18,9 +19,12 @@ import java.util.stream.Collectors;
|
|||
public class PaceConfig implements Serializable {
|
||||
|
||||
private List<FieldDef> model;
|
||||
private List<CondDef> strictConditions;
|
||||
private List<CondDef> conditions;
|
||||
|
||||
private List<CondDef> sufficientConditions;
|
||||
private List<CondDef> necessaryConditions;
|
||||
private List<ClusteringDef> clustering;
|
||||
private Map<String, TreeNodeDef> decisionTree;
|
||||
|
||||
private Map<String, List<String>> blacklists;
|
||||
|
||||
@JsonIgnore
|
||||
|
@ -46,30 +50,30 @@ public class PaceConfig implements Serializable {
|
|||
this.model = model;
|
||||
}
|
||||
|
||||
public List<CondDef> getStrictConditions() {
|
||||
return strictConditions;
|
||||
public List<CondDef> getSufficientConditions() {
|
||||
return sufficientConditions;
|
||||
}
|
||||
|
||||
public void setStrictConditions(final List<CondDef> strictConditions) {
|
||||
this.strictConditions = strictConditions;
|
||||
public void setSufficientConditions(final List<CondDef> sufficientConditions) {
|
||||
this.sufficientConditions = sufficientConditions;
|
||||
}
|
||||
|
||||
public List<CondDef> getConditions() {
|
||||
return conditions;
|
||||
public List<CondDef> getNecessaryConditions() {
|
||||
return necessaryConditions;
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public List<ConditionAlgo> getConditionAlgos() {
|
||||
return asConditionAlgos(getConditions());
|
||||
return asConditionAlgos(getNecessaryConditions());
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public List<ConditionAlgo> getStrictConditionAlgos() {
|
||||
return asConditionAlgos(getStrictConditions());
|
||||
return asConditionAlgos(getSufficientConditions());
|
||||
}
|
||||
|
||||
public void setConditions(final List<CondDef> conditions) {
|
||||
this.conditions = conditions;
|
||||
public void setNecessaryConditions(final List<CondDef> necessaryConditions) {
|
||||
this.necessaryConditions = necessaryConditions;
|
||||
}
|
||||
|
||||
public List<ClusteringDef> getClustering() {
|
||||
|
@ -80,6 +84,14 @@ public class PaceConfig implements Serializable {
|
|||
this.clustering = clustering;
|
||||
}
|
||||
|
||||
public Map<String, TreeNodeDef> getDecisionTree() {
|
||||
return decisionTree;
|
||||
}
|
||||
|
||||
public void setDecisionTree(Map<String, TreeNodeDef> decisionTree) {
|
||||
this.decisionTree = decisionTree;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getBlacklists() {
|
||||
return blacklists;
|
||||
}
|
||||
|
|
|
@ -1,15 +1,15 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
|
||||
public abstract class AbstractDistance<A> implements Distance<A> {
|
||||
|
||||
protected abstract Document toDocument(A a);
|
||||
|
||||
@Override
|
||||
public ScoreResult between(final A a, final A b, final Config config) {
|
||||
return new DistanceScorer(config).distance(toDocument(a), toDocument(b));
|
||||
}
|
||||
}
|
||||
//package eu.dnetlib.pace.distance;
|
||||
//
|
||||
//import eu.dnetlib.pace.config.Config;
|
||||
//import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
//import eu.dnetlib.pace.model.Document;
|
||||
//
|
||||
//public abstract class AbstractDistance<A> implements Distance<A> {
|
||||
//
|
||||
// protected abstract Document toDocument(A a);
|
||||
//
|
||||
// @Override
|
||||
// public boolean between(final A a, final A b, final Config config) {
|
||||
// return new PairwiseComparison(config).compare(toDocument(a), toDocument(b));
|
||||
// }
|
||||
//}
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
|
||||
public interface Distance<A> {
|
||||
|
||||
public ScoreResult between(A a, A b, Config config);
|
||||
public boolean between(A a, A b, Config config);
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
|
||||
* Each field is configured with a comparison algo which knows how to compute the comparison score (0-1) between the fields of two
|
||||
* objects.
|
||||
*/
|
||||
public interface DistanceAlgo {
|
||||
|
|
|
@ -1,126 +0,0 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.distance.eval.DistanceEval;
|
||||
import eu.dnetlib.pace.distance.eval.DistanceEvalMap;
|
||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* The distance between two documents is given by the weighted mean of the field distances
|
||||
*/
|
||||
public class DistanceScorer {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DistanceScorer.class);
|
||||
|
||||
private Config config;
|
||||
|
||||
public DistanceScorer(final Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public ScoreResult distance(final Document a, final Document b) {
|
||||
final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison
|
||||
|
||||
sr.setStrictConditions(verify(a, b, config.strictConditions()));
|
||||
sr.setConditions(verify(a, b, config.conditions()));
|
||||
|
||||
final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model()));
|
||||
|
||||
for (final FieldDef fd : config.model()) {
|
||||
|
||||
dMap.updateDistance(fieldDistance(a, b, fd));
|
||||
}
|
||||
sr.setDistances(dMap);
|
||||
return sr;
|
||||
}
|
||||
|
||||
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
|
||||
final ConditionEvalMap res = new ConditionEvalMap();
|
||||
|
||||
for (final ConditionAlgo cd : conditions) {
|
||||
final ConditionEvalMap map = cd.verify(a, b);
|
||||
res.mergeFrom(map);
|
||||
|
||||
// commented out shortcuts
|
||||
/*
|
||||
if (map.anyNegative()) {
|
||||
return res;
|
||||
}
|
||||
*/
|
||||
|
||||
//if (strict && (res < 0)) return -1;
|
||||
//cond += verify;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) {
|
||||
|
||||
final double w = fd.getWeight();
|
||||
final Field va = getValue(a, fd);
|
||||
final Field vb = getValue(b, fd);
|
||||
|
||||
final DistanceEval de = new DistanceEval(fd, va, vb);
|
||||
if ((w == 0)) return de; // optimization for 0 weight
|
||||
else {
|
||||
if (va.isEmpty() || vb.isEmpty()) {
|
||||
if (fd.isIgnoreMissing()) {
|
||||
de.setDistance(-1);
|
||||
} else {
|
||||
de.setDistance(w);
|
||||
}
|
||||
} else {
|
||||
if (va.getType().equals(vb.getType())) {
|
||||
de.setDistance(w * fd.distanceAlgo().distance(va, vb));
|
||||
} else {
|
||||
throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
||||
}
|
||||
}
|
||||
return de;
|
||||
}
|
||||
}
|
||||
|
||||
private Field getValue(final Document d, final FieldDef fd) {
|
||||
final Field v = d.values(fd.getName());
|
||||
if (fd.getLength() > 0) {
|
||||
|
||||
if (v instanceof FieldValueImpl) {
|
||||
((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
|
||||
} else if (v instanceof FieldListImpl) {
|
||||
List<String> strings = ((FieldListImpl) v).stringList();
|
||||
strings = strings.stream()
|
||||
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
.collect(Collectors.toList());
|
||||
((FieldListImpl) v).clear();
|
||||
((FieldListImpl) v).addAll(strings.stream()
|
||||
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
.map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
private double sumWeights(final Collection<FieldDef> fields) {
|
||||
double sum = 0.0;
|
||||
for (final FieldDef fd : fields) {
|
||||
sum += fd.getWeight();
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,12 +1,12 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
|
||||
public class PaceDocumentDistance extends AbstractDistance<Document> {
|
||||
|
||||
@Override
|
||||
protected Document toDocument(Document a) {
|
||||
return a;
|
||||
}
|
||||
|
||||
}
|
||||
//package eu.dnetlib.pace.distance;
|
||||
//
|
||||
//import eu.dnetlib.pace.model.Document;
|
||||
//
|
||||
//public class PaceDocumentDistance extends AbstractDistance<Document> {
|
||||
//
|
||||
// @Override
|
||||
// protected Document toDocument(Document a) {
|
||||
// return a;
|
||||
// }
|
||||
//
|
||||
//}
|
||||
|
|
|
@ -0,0 +1,125 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import eu.dnetlib.pace.tree.support.MatchType;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import eu.dnetlib.pace.util.Reporter;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The compare between two documents is given by the weighted mean of the field distances
|
||||
*/
|
||||
public class PairwiseComparison {
|
||||
|
||||
private static final Log log = LogFactory.getLog(PairwiseComparison.class);
|
||||
|
||||
private Config config;
|
||||
|
||||
public PairwiseComparison(final Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public boolean compare(final MapDocument a, final MapDocument b) {
|
||||
|
||||
//verify sufficientConditions
|
||||
if (verify(a, b, config.sufficientConditions()).result() > 0)
|
||||
return true;
|
||||
|
||||
//verify necessaryConditions
|
||||
if (verify(a, b, config.necessaryConditions()).result() < 0)
|
||||
return false;
|
||||
|
||||
//evaluate the decision tree
|
||||
return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH;
|
||||
}
|
||||
|
||||
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
|
||||
final ConditionEvalMap res = new ConditionEvalMap();
|
||||
|
||||
for (final ConditionAlgo cd : conditions) {
|
||||
final ConditionEvalMap map = cd.verify(a, b);
|
||||
res.mergeFrom(map);
|
||||
|
||||
// commented out shortcuts
|
||||
/*
|
||||
if (map.anyNegative()) {
|
||||
return res;
|
||||
}
|
||||
*/
|
||||
|
||||
//if (strict && (res < 0)) return -1;
|
||||
//cond += verify;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map<String, TreeNodeDef> decisionTree){
|
||||
|
||||
String current = "start";
|
||||
double similarity;
|
||||
|
||||
while (MatchType.parse(current)==MatchType.UNDEFINED) {
|
||||
|
||||
TreeNodeDef currentNode = decisionTree.get(current);
|
||||
//throw an exception if the node doesn't exist
|
||||
if (currentNode == null)
|
||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
||||
|
||||
similarity = currentNode.evaluate(doc1, doc2);
|
||||
|
||||
if (similarity == -1) {
|
||||
current = currentNode.getUndefined();
|
||||
}
|
||||
else if (similarity>=currentNode.getThreshold()){
|
||||
current = currentNode.getPositive();
|
||||
}
|
||||
else {
|
||||
current = currentNode.getNegative();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return MatchType.parse(current);
|
||||
}
|
||||
|
||||
// private Field getValue(final Document d, final FieldDef fd) {
|
||||
// final Field v = d.values(fd.getName());
|
||||
// if (fd.getLength() > 0) {
|
||||
//
|
||||
// if (v instanceof FieldValueImpl) {
|
||||
// ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
|
||||
// } else if (v instanceof FieldListImpl) {
|
||||
// List<String> strings = ((FieldListImpl) v).stringList();
|
||||
// strings = strings.stream()
|
||||
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
// .collect(Collectors.toList());
|
||||
// ((FieldListImpl) v).clear();
|
||||
// ((FieldListImpl) v).addAll(strings.stream()
|
||||
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
// .map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
|
||||
// .collect(Collectors.toList()));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return v;
|
||||
// }
|
||||
//
|
||||
// private double sumWeights(final Collection<FieldDef> fields) {
|
||||
// double sum = 0.0;
|
||||
// for (final FieldDef fd : fields) {
|
||||
// sum += fd.getWeight();
|
||||
// }
|
||||
// return sum;
|
||||
// }
|
||||
|
||||
}
|
|
@ -12,7 +12,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
/**
|
||||
* For the rest of the fields delegate the distance measure to the second string library.
|
||||
* For the rest of the fields delegate the comparison measure to the second string library.
|
||||
*/
|
||||
public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo {
|
||||
|
||||
|
@ -35,7 +35,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
|||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new second string distance algo.
|
||||
* Instantiates a new second string comparison algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
|
@ -90,7 +90,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final Field a, final Field b) {
|
||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.distance.algo;
|
|||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.DistanceScorer;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
|
|
@ -7,7 +7,7 @@ import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Compared distance between two titles, ignoring version numbers. Suitable for Software entities.
|
||||
* Compares two titles, ignoring version numbers. Suitable for Software entities.
|
||||
*/
|
||||
@DistanceClass("LevensteinTitleIgnoreVersion")
|
||||
public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo {
|
||||
|
|
|
@ -7,7 +7,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Not all fields of a document need to partecipate in the distance measure. We model those fields as having a
|
||||
* Not all fields of a document need to participate in the comparison measure. We model those fields as having a
|
||||
* NullDistanceAlgo.
|
||||
*/
|
||||
@DistanceClass("Null")
|
||||
|
|
|
@ -40,7 +40,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
|
@ -50,7 +50,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
|
|
|
@ -40,7 +40,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
|
@ -50,7 +50,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
|
|
|
@ -12,12 +12,12 @@ import eu.dnetlib.pace.model.Field;
|
|||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
/**
|
||||
* For the rest of the fields delegate the distance measure to the second string library.
|
||||
* For the rest of the fields delegate the comparison measure to the second string library.
|
||||
*/
|
||||
public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted second string distance algo.
|
||||
* Instantiates a new sorted second string comparison algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
|
@ -35,7 +35,7 @@ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanc
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
protected List<String> toList(final Field list) {
|
||||
|
|
|
@ -66,7 +66,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final Field a, final Field b) {
|
||||
|
@ -79,7 +79,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
|
@ -89,7 +89,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
|||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
|
|
|
@ -1,32 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.eval;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Created by claudio on 10/03/16.
|
||||
*/
|
||||
public class DistanceEvalMap extends HashMap<String, DistanceEval> {
|
||||
|
||||
private double sumWeights;
|
||||
|
||||
private double sumDistances = 0.0;
|
||||
|
||||
public DistanceEvalMap(final double sumWeights) {
|
||||
this.sumWeights = sumWeights;
|
||||
}
|
||||
|
||||
public void updateDistance(final DistanceEval d) {
|
||||
|
||||
put(d.getFieldDef().getName(), d);
|
||||
if (d.getDistance() >= 0) {
|
||||
sumDistances += d.getDistance();
|
||||
} else {
|
||||
sumWeights -= d.getFieldDef().getWeight();
|
||||
}
|
||||
}
|
||||
|
||||
public double distance() {
|
||||
return sumWeights == 0 ? 0 : sumDistances / sumWeights;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,62 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.eval;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Created by claudio on 09/03/16.
|
||||
*/
|
||||
public class ScoreResult implements Serializable {
|
||||
|
||||
private ConditionEvalMap strictConditions;
|
||||
|
||||
private ConditionEvalMap conditions;
|
||||
|
||||
private DistanceEvalMap distances;
|
||||
|
||||
public double getScore() {
|
||||
|
||||
if (getStrictConditions().result() > 0) return 1.0;
|
||||
// if (getStrictConditions().result() < 0) return 0.0;
|
||||
if (getConditions().result() < 0) return 0.0;
|
||||
|
||||
return getDistances().distance();
|
||||
}
|
||||
|
||||
|
||||
public ConditionEvalMap getStrictConditions() {
|
||||
return strictConditions;
|
||||
}
|
||||
|
||||
public void setStrictConditions(final ConditionEvalMap strictConditions) {
|
||||
this.strictConditions = strictConditions;
|
||||
}
|
||||
|
||||
public ConditionEvalMap getConditions() {
|
||||
return conditions;
|
||||
}
|
||||
|
||||
public void setConditions(final ConditionEvalMap conditions) {
|
||||
this.conditions = conditions;
|
||||
}
|
||||
|
||||
public DistanceEvalMap getDistances() {
|
||||
return distances;
|
||||
}
|
||||
|
||||
public void setDistances(final DistanceEvalMap distances) {
|
||||
this.distances = distances;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("unable to serialise " + this.getClass().getName(), e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -14,25 +14,29 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm.
|
||||
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated comparison algorithm.
|
||||
*/
|
||||
public class FieldDef implements Serializable {
|
||||
|
||||
public final static String PATH_SEPARATOR = "/";
|
||||
|
||||
private String algo;
|
||||
|
||||
private String name;
|
||||
|
||||
private String path;
|
||||
|
||||
private boolean ignoreMissing;
|
||||
|
||||
private Type type;
|
||||
|
||||
private boolean overrideMatch;
|
||||
private boolean ignoreMissing;
|
||||
|
||||
private double weight;
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
private boolean overrideMatch;
|
||||
|
||||
/**
|
||||
* Sets maximum size for the repeatable fields in the model. -1 for unbounded size.
|
||||
|
@ -74,20 +78,6 @@ public class FieldDef implements Serializable {
|
|||
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
|
||||
}
|
||||
|
||||
public DistanceAlgo distanceAlgo() {
|
||||
|
||||
if (params == null) {
|
||||
params = new HashMap<>();
|
||||
}
|
||||
|
||||
params.put("weight", getWeight());
|
||||
return PaceConfig.resolver.getDistanceAlgo(getAlgo(), params);
|
||||
}
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
}
|
||||
|
||||
public Type getType() {
|
||||
return type;
|
||||
}
|
||||
|
@ -104,23 +94,6 @@ public class FieldDef implements Serializable {
|
|||
this.overrideMatch = overrideMatch;
|
||||
}
|
||||
|
||||
public double getWeight() {
|
||||
return weight;
|
||||
}
|
||||
|
||||
public void setWeight(final double weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
public String getAlgo() {
|
||||
return algo;
|
||||
}
|
||||
|
||||
public void setAlgo(final String algo) {
|
||||
this.algo = algo;
|
||||
}
|
||||
|
||||
|
||||
public int getSize() {
|
||||
return size;
|
||||
}
|
||||
|
@ -153,10 +126,6 @@ public class FieldDef implements Serializable {
|
|||
this.path = path;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("alwaysMatch")
|
||||
public class AlwaysMatch extends AbstractComparator {
|
||||
|
||||
public AlwaysMatch(final Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public AlwaysMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("exactMatch")
|
||||
public class ExactMatch extends AbstractComparator {
|
||||
|
||||
public ExactMatch(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public ExactMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@ComparatorClass("jaroWinkler")
|
||||
public class JaroWinkler extends AbstractComparator {
|
||||
|
||||
public JaroWinkler(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public JaroWinkler(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,78 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ComparatorClass("jaroWinklerNormalizedName")
|
||||
public class JaroWinklerNormalizedName extends AbstractComparator {
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
public JaroWinklerNormalizedName(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public JaroWinklerNormalizedName(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
if (sameCity(cities1,cities2)) {
|
||||
|
||||
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||
|
||||
ca = removeKeywords(ca, keywords1);
|
||||
ca = removeKeywords(ca, cities1);
|
||||
cb = removeKeywords(cb, keywords2);
|
||||
cb = removeKeywords(cb, cities2);
|
||||
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca,cb));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@ComparatorClass("jaroWinklerTitle")
|
||||
public class JaroWinklerTitle extends AbstractComparator {
|
||||
|
||||
public JaroWinklerTitle(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public JaroWinklerTitle(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
boolean check = checkNumbers(ca, cb);
|
||||
return check ? 0.5 : normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2JaroWinkler")
|
||||
public class Level2JaroWinkler extends AbstractComparator {
|
||||
|
||||
public Level2JaroWinkler(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public Level2JaroWinkler(double w) {
|
||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2JaroWinklerTitle")
|
||||
public class Level2JaroWinklerTitle extends AbstractComparator {
|
||||
|
||||
public Level2JaroWinklerTitle(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public Level2JaroWinklerTitle(final double w) {
|
||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return ssalgo.score(cca, ccb);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2Levenstein")
|
||||
public class Level2Levenstein extends AbstractComparator {
|
||||
|
||||
public Level2Levenstein(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Level2Levenstein());
|
||||
}
|
||||
|
||||
public Level2Levenstein(double w) {
|
||||
super(w, new com.wcohen.ss.Level2Levenstein());
|
||||
}
|
||||
|
||||
protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("levenstein")
|
||||
public class Levenstein extends AbstractComparator {
|
||||
|
||||
public Levenstein(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public Levenstein(double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected Levenstein(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("levensteinTitle")
|
||||
public class LevensteinTitle extends AbstractComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
||||
|
||||
public LevensteinTitle(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public LevensteinTitle(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
return 1 - (Math.abs(score) / Math.max(la, lb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Compared compare between two titles, ignoring version numbers. Suitable for Software entities.
|
||||
*/
|
||||
@ComparatorClass("levensteinTitleIgnoreVersion")
|
||||
public class LevensteinTitleIgnoreVersion extends AbstractComparator {
|
||||
|
||||
public LevensteinTitleIgnoreVersion(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public LevensteinTitleIgnoreVersion(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim();
|
||||
cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim();
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
return 1 - (Math.abs(score) / Math.max(la, lb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("mustBeDifferent")
|
||||
public class MustBeDifferent extends AbstractComparator {
|
||||
|
||||
public MustBeDifferent(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public MustBeDifferent(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
return !a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Not all fields of a document need to partecipate in the compare measure. We model those fields as having a
|
||||
* NullDistanceAlgo.
|
||||
*/
|
||||
@ComparatorClass("null")
|
||||
public class NullDistanceAlgo implements Comparator {
|
||||
|
||||
public NullDistanceAlgo(Map<String, Number> params){
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b) {
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@ComparatorClass("sortedJaroWinkler")
|
||||
public class SortedJaroWinkler extends AbstractSortedComparator {
|
||||
|
||||
public SortedJaroWinkler(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
*/
|
||||
public SortedJaroWinkler(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@ComparatorClass("sortedLevel2JaroWinkler")
|
||||
public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
*/
|
||||
public SortedLevel2JaroWinkler(final double weight) {
|
||||
super(weight, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public SortedLevel2JaroWinkler(final Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,99 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SubStringLevenstein.
|
||||
*/
|
||||
@ComparatorClass("subStringLevenstein")
|
||||
public class SubStringLevenstein extends AbstractComparator {
|
||||
|
||||
/** The limit. */
|
||||
protected int limit;
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
*/
|
||||
public SubStringLevenstein(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public SubStringLevenstein(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
this.limit = params.get("limit").intValue();
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
* @param limit
|
||||
* the limit
|
||||
*/
|
||||
public SubStringLevenstein(final double w, final int limit) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
* @param limit
|
||||
* the limit
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("urlMatcher")
|
||||
public class UrlMatcher extends Levenstein {
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
public UrlMatcher(Map<String, Number> params){
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public UrlMatcher(double weight, Map<String, Number> params) {
|
||||
super(weight);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public void setParams(Map<String, Number> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b) {
|
||||
|
||||
final URL urlA = asUrl(getFirstValue(a));
|
||||
final URL urlB = asUrl(getFirstValue(b));
|
||||
|
||||
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Double hostW = params.get("host").doubleValue();
|
||||
Double pathW = params.get("path").doubleValue();
|
||||
|
||||
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
|
||||
return hostW * 0.5;
|
||||
}
|
||||
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) {
|
||||
try {
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator {
|
||||
|
||||
/** The ssalgo. */
|
||||
protected AbstractStringDistance ssalgo;
|
||||
|
||||
/** The weight. */
|
||||
protected double weight = 0.0;
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
protected AbstractComparator(Map<String, Number> params, final AbstractStringDistance ssalgo){
|
||||
this.params = params;
|
||||
this.weight = 1.0;
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
this.ssalgo = ssalgo;
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
protected AbstractComparator(final AbstractStringDistance ssalgo){
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize.
|
||||
*
|
||||
* @param d
|
||||
* the d
|
||||
* @return the double
|
||||
*/
|
||||
protected abstract double normalize(double d);
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
public double distance(final String a, final String b) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1; //return -1 if a field is missing
|
||||
}
|
||||
double score = ssalgo.score(a, b);
|
||||
return normalize(score);
|
||||
}
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
protected double distance(final List<String> a, final List<String> b) {
|
||||
return distance(concat(a), concat(b));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue());
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b));
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* To list.
|
||||
*
|
||||
* @param list
|
||||
* the list
|
||||
* @return the list
|
||||
*/
|
||||
protected List<String> toList(final Field list) {
|
||||
return ((FieldList) list).stringList();
|
||||
}
|
||||
|
||||
public double getWeight(){
|
||||
return this.weight;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class AbstractSortedComparator extends AbstractComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
protected AbstractSortedComparator(final Map<String, Number> params, final AbstractStringDistance ssalgo){
|
||||
super(params.get("weight").doubleValue(), ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> toList(final Field list) {
|
||||
FieldList fl = (FieldList) list;
|
||||
List<String> values = Lists.newArrayList(fl.stringList());
|
||||
Collections.sort(values);
|
||||
return values;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public enum AggType {
|
||||
|
||||
WEIGHTED_MEAN,
|
||||
AVG,
|
||||
SUM,
|
||||
MAX,
|
||||
MIN;
|
||||
|
||||
public static AggType getEnum(String value) {
|
||||
|
||||
try {
|
||||
return AggType.valueOf(value);
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
throw new PaceException("Undefined aggregation type", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
public interface Comparator {
|
||||
|
||||
public double compare(Field a, Field b);
|
||||
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Type-level annotation, retained at runtime, carrying the name associated with a comparator class.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface ComparatorClass {

    /**
     * @return the name given to the annotated class
     */
    String value();
}
|
|
@ -1,4 +1,4 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
@ -14,14 +14,25 @@ public class FieldConf implements Serializable {
|
|||
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
||||
private Map<String,Number> params; //parameters
|
||||
|
||||
private boolean ignoreMissing;
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
/**
 * Creates a field configuration with every field left at its declared default.
 */
public FieldConf() {
    // intentionally empty
}
|
||||
|
||||
public FieldConf(String field, String comparator, double weight, Map<String, Number> params) {
|
||||
/**
 * Creates a fully specified field configuration.
 *
 * @param field         the name of the field
 * @param comparator    the name of the comparator to use
 * @param weight        the weight of the field
 * @param params        the comparator parameters
 * @param ignoreMissing the value of the {@code ignoreMissing} flag
 */
public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean ignoreMissing) {
    this.ignoreMissing = ignoreMissing;
    this.params = params;
    this.weight = weight;
    this.comparator = comparator;
    this.field = field;
}
|
||||
|
||||
public String getField() {
|
|
@ -0,0 +1,18 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
/**
 * Possible outcomes of a match decision.
 */
public enum MatchType {

    MATCH,
    NO_MATCH,
    UNDEFINED;

    /**
     * Resolves a match type from its constant name.
     *
     * @param value the exact name of one of the constants, may be null
     * @return the matching constant, or {@link #UNDEFINED} when {@code value} is null
     *         or does not name a constant
     */
    public static MatchType parse(String value) {

        // Enum.valueOf throws NullPointerException for null; a null name is just another unparsable value.
        if (value == null) {
            return MatchType.UNDEFINED;
        }

        try {
            return MatchType.valueOf(value);
        }
        catch (IllegalArgumentException e) {
            return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
        }
    }
}
|
|
@ -0,0 +1,157 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class TreeNodeDef implements Serializable {
|
||||
|
||||
private List<FieldConf> fields;
|
||||
private AggType aggregation;
|
||||
|
||||
private double threshold;
|
||||
|
||||
private String positive;
|
||||
private String negative;
|
||||
private String undefined;
|
||||
|
||||
boolean ignoreMissing;
|
||||
|
||||
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreMissing) {
|
||||
this.fields = fields;
|
||||
this.aggregation = aggregation;
|
||||
this.threshold = threshold;
|
||||
this.positive = positive;
|
||||
this.negative = negative;
|
||||
this.undefined = undefined;
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
/**
 * Creates a tree node definition with every field left at its declared default.
 */
public TreeNodeDef() {
    // intentionally empty
}
|
||||
|
||||
public double evaluate(MapDocument doc1, MapDocument doc2) {
|
||||
|
||||
DescriptiveStatistics stats = new DescriptiveStatistics();
|
||||
double sumWeights = 0.0; //for the weighted mean
|
||||
|
||||
int missCount = 0; //counter for the number of misses
|
||||
|
||||
for (FieldConf fieldConf : fields) {
|
||||
|
||||
double weight = fieldConf.getWeight();
|
||||
|
||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
|
||||
|
||||
if (result >= 0.0) { //if the field is not missing
|
||||
stats.addValue(weight * result);
|
||||
sumWeights += weight; //sum weights, to be used in case of weighted mean
|
||||
}
|
||||
else { //if the field is missing
|
||||
missCount += 1;
|
||||
if (!fieldConf.isIgnoreMissing()){ //if the miss has not to be ignored
|
||||
stats.addValue(weight * 0);
|
||||
sumWeights += weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//global ignoremissing (if one of the field is missing, return undefined)
|
||||
if (!ignoreMissing && missCount>0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
switch (aggregation){
|
||||
|
||||
case AVG:
|
||||
return stats.getMean();
|
||||
case SUM:
|
||||
return stats.getSum();
|
||||
case MAX:
|
||||
return stats.getMax();
|
||||
case MIN:
|
||||
return stats.getMin();
|
||||
case WEIGHTED_MEAN:
|
||||
return stats.getSum()/sumWeights;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private Comparator comparator(final FieldConf field){
|
||||
|
||||
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
|
||||
}
|
||||
|
||||
public List<FieldConf> getFields() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
public void setFields(List<FieldConf> fields) {
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
public AggType getAggregation() {
|
||||
return aggregation;
|
||||
}
|
||||
|
||||
public void setAggregation(AggType aggregation) {
|
||||
this.aggregation = aggregation;
|
||||
}
|
||||
|
||||
public double getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
|
||||
public void setThreshold(double threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
public String getPositive() {
|
||||
return positive;
|
||||
}
|
||||
|
||||
public void setPositive(String positive) {
|
||||
this.positive = positive;
|
||||
}
|
||||
|
||||
public String getNegative() {
|
||||
return negative;
|
||||
}
|
||||
|
||||
public void setNegative(String negative) {
|
||||
this.negative = negative;
|
||||
}
|
||||
|
||||
public String getUndefined() {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
public void setUndefined(String undefined) {
|
||||
this.undefined = undefined;
|
||||
}
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -4,8 +4,8 @@ import com.google.common.collect.Lists;
|
|||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.WfConfig;
|
||||
import eu.dnetlib.pace.distance.PaceDocumentDistance;
|
||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
|
||||
import eu.dnetlib.pace.distance.PairwiseComparison;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.MapDocumentComparator;
|
||||
|
@ -116,7 +116,7 @@ public class BlockProcessor {
|
|||
|
||||
private void process(final Queue<MapDocument> queue, final Reporter context) {
|
||||
|
||||
final PaceDocumentDistance algo = new PaceDocumentDistance();
|
||||
// final PaceDocumentDistance algo = new PaceDocumentDistance();
|
||||
|
||||
while (!queue.isEmpty()) {
|
||||
|
||||
|
@ -150,21 +150,23 @@ public class BlockProcessor {
|
|||
|
||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||
|
||||
final ScoreResult sr = similarity(algo, pivot, curr);
|
||||
// log.info(sr.toString()+"SCORE "+ sr.getScore());
|
||||
emitOutput(sr, idPivot, idCurr, context);
|
||||
i++;
|
||||
final PairwiseComparison pairwiseComparison = new PairwiseComparison(dedupConf);
|
||||
|
||||
emitOutput(pairwiseComparison.compare(pivot, curr), idPivot, idCurr, context);
|
||||
|
||||
// final ScoreResult sr = similarity(algo, pivot, curr);
|
||||
//// log.info(sr.toString()+"SCORE "+ sr.getScore());
|
||||
// emitOutput(sr, idPivot, idCurr, context);
|
||||
// i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void emitOutput(final ScoreResult sr, final String idPivot, final String idCurr, final Reporter context) {
|
||||
final double d = sr.getScore();
|
||||
|
||||
if (d >= dedupConf.getWf().getThreshold()) {
|
||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||
|
||||
if (result) {
|
||||
writeSimilarity(context, idPivot, idCurr);
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
|
||||
} else {
|
||||
|
@ -172,15 +174,6 @@ public class BlockProcessor {
|
|||
}
|
||||
}
|
||||
|
||||
private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) {
|
||||
try {
|
||||
return algo.between(a, b, dedupConf);
|
||||
} catch(Throwable e) {
|
||||
log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e);
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean mustSkip(final String idPivot) {
|
||||
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
|
||||
}
|
||||
|
|
|
@ -1390,7 +1390,7 @@ public class DiffPatchMatch {
|
|||
}
|
||||
|
||||
/**
|
||||
* Compute the Levenshtein distance; the number of inserted, deleted or
|
||||
* Compute the Levenshtein distance; the number of inserted, deleted or
|
||||
* substituted characters.
|
||||
* @param diffs List of Diff objects.
|
||||
* @return Number of changes.
|
||||
|
@ -1655,7 +1655,7 @@ public class DiffPatchMatch {
|
|||
score_threshold = score;
|
||||
best_loc = j - 1;
|
||||
if (best_loc > loc) {
|
||||
// When passing loc, don't exceed our current distance from loc.
|
||||
// When passing loc, don't exceed our current distance from loc.
|
||||
start = Math.max(1, 2 * loc - best_loc);
|
||||
} else {
|
||||
// Already passed loc, downhill from here on in.
|
||||
|
|
|
@ -7,6 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass;
|
|||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.reflections.Reflections;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -19,11 +21,13 @@ public class PaceResolver implements Serializable {
|
|||
|
||||
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
|
||||
public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition");
|
||||
public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.distance.algo");
|
||||
public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.compare.algo");
|
||||
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
|
||||
|
||||
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
|
||||
private final Map<String, Class<ConditionAlgo>> conditionAlgos;
|
||||
private final Map<String, Class<DistanceAlgo>> distanceAlgos;
|
||||
private final Map<String, Class<Comparator>> comparators;
|
||||
|
||||
public PaceResolver() {
|
||||
|
||||
|
@ -38,6 +42,10 @@ public class PaceResolver implements Serializable {
|
|||
this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream()
|
||||
.filter(DistanceAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
||||
|
||||
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
|
||||
.filter(Comparator.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl));
|
||||
}
|
||||
|
||||
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
|
||||
|
@ -64,4 +72,12 @@ public class PaceResolver implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
|
||||
try {
|
||||
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -16,9 +16,9 @@
|
|||
"pace" : {
|
||||
"clustering" : [
|
||||
],
|
||||
"strictConditions" : [
|
||||
"sufficientConditions" : [
|
||||
],
|
||||
"conditions" : [
|
||||
"necessaryConditions" : [
|
||||
],
|
||||
"model" : [
|
||||
],
|
||||
|
|
|
@ -16,10 +16,10 @@
|
|||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
|
||||
],
|
||||
"strictConditions" : [
|
||||
"sufficientConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
|
||||
],
|
||||
"conditions" : [
|
||||
"necessaryConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] },
|
||||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
|
||||
],
|
||||
|
|
Loading…
Reference in New Issue