master #59

Closed
claudio.atzori wants to merge 3221 commits from master into stable_ids
52 changed files with 1517 additions and 357 deletions
Showing only changes of commit a5c5d2f01b - Show all commits

View File

@ -9,7 +9,7 @@ import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
/**
* Abstract conditions needs a list of field names.
* Abstract necessaryConditions needs a list of field names.
*
* @author claudio
*

View File

@ -6,7 +6,7 @@ import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.FieldDef;
/**
* Allows to express general conditions to be satisfied or not between two Documents.
* Allows to express general necessaryConditions to be satisfied or not between two Documents.
*
* @author claudio
*/

View File

@ -6,6 +6,7 @@ import java.util.Map;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
/**
* Interface for PACE configuration bean.
@ -21,6 +22,9 @@ public interface Config {
*/
public List<FieldDef> model();
public Map<String, TreeNodeDef> decisionTree();
/**
* Field configuration definitions.
*
@ -31,16 +35,16 @@ public interface Config {
/**
* Strict Pre-Condition definitions.
*
* @return the list of conditions
* @return the list of necessaryConditions
*/
public List<ConditionAlgo> strictConditions();
public List<ConditionAlgo> sufficientConditions();
/**
* Pre-Condition definitions.
*
* @return the list of conditions
* @return the list of necessaryConditions
*/
public List<ConditionAlgo> conditions();
public List<ConditionAlgo> necessaryConditions();
/**
* Clusterings.

View File

@ -8,6 +8,7 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.function.BiFunction;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
@ -114,6 +115,11 @@ public class DedupConfig implements Config, Serializable {
}
}
@Override
public Map<String, TreeNodeDef> decisionTree(){
return getPace().getDecisionTree();
}
@Override
public List<FieldDef> model() {
return getPace().getModel();
@ -125,12 +131,12 @@ public class DedupConfig implements Config, Serializable {
}
@Override
public List<ConditionAlgo> strictConditions() {
public List<ConditionAlgo> sufficientConditions() {
return getPace().getStrictConditionAlgos();
}
@Override
public List<ConditionAlgo> conditions() {
public List<ConditionAlgo> necessaryConditions() {
return getPace().getConditionAlgos();
}

View File

@ -6,6 +6,7 @@ import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.CondDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceResolver;
import org.apache.commons.collections.CollectionUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
@ -18,9 +19,12 @@ import java.util.stream.Collectors;
public class PaceConfig implements Serializable {
private List<FieldDef> model;
private List<CondDef> strictConditions;
private List<CondDef> conditions;
private List<CondDef> sufficientConditions;
private List<CondDef> necessaryConditions;
private List<ClusteringDef> clustering;
private Map<String, TreeNodeDef> decisionTree;
private Map<String, List<String>> blacklists;
@JsonIgnore
@ -46,30 +50,30 @@ public class PaceConfig implements Serializable {
this.model = model;
}
public List<CondDef> getStrictConditions() {
return strictConditions;
public List<CondDef> getSufficientConditions() {
return sufficientConditions;
}
public void setStrictConditions(final List<CondDef> strictConditions) {
this.strictConditions = strictConditions;
public void setSufficientConditions(final List<CondDef> sufficientConditions) {
this.sufficientConditions = sufficientConditions;
}
public List<CondDef> getConditions() {
return conditions;
public List<CondDef> getNecessaryConditions() {
return necessaryConditions;
}
@JsonIgnore
public List<ConditionAlgo> getConditionAlgos() {
return asConditionAlgos(getConditions());
return asConditionAlgos(getNecessaryConditions());
}
@JsonIgnore
public List<ConditionAlgo> getStrictConditionAlgos() {
return asConditionAlgos(getStrictConditions());
return asConditionAlgos(getSufficientConditions());
}
public void setConditions(final List<CondDef> conditions) {
this.conditions = conditions;
public void setNecessaryConditions(final List<CondDef> necessaryConditions) {
this.necessaryConditions = necessaryConditions;
}
public List<ClusteringDef> getClustering() {
@ -80,6 +84,14 @@ public class PaceConfig implements Serializable {
this.clustering = clustering;
}
public Map<String, TreeNodeDef> getDecisionTree() {
return decisionTree;
}
public void setDecisionTree(Map<String, TreeNodeDef> decisionTree) {
this.decisionTree = decisionTree;
}
public Map<String, List<String>> getBlacklists() {
return blacklists;
}

View File

@ -1,15 +1,15 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.eval.ScoreResult;
import eu.dnetlib.pace.model.Document;
public abstract class AbstractDistance<A> implements Distance<A> {
protected abstract Document toDocument(A a);
@Override
public ScoreResult between(final A a, final A b, final Config config) {
return new DistanceScorer(config).distance(toDocument(a), toDocument(b));
}
}
//package eu.dnetlib.pace.distance;
//
//import eu.dnetlib.pace.config.Config;
//import eu.dnetlib.pace.distance.eval.ScoreResult;
//import eu.dnetlib.pace.model.Document;
//
//public abstract class AbstractDistance<A> implements Distance<A> {
//
// protected abstract Document toDocument(A a);
//
// @Override
// public boolean between(final A a, final A b, final Config config) {
// return new PairwiseComparison(config).compare(toDocument(a), toDocument(b));
// }
//}

View File

@ -1,9 +1,8 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.eval.ScoreResult;
public interface Distance<A> {
public ScoreResult between(A a, A b, Config config);
public boolean between(A a, A b, Config config);
}

View File

@ -5,7 +5,7 @@ import eu.dnetlib.pace.model.Field;
import java.util.Map;
/**
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
* Each field is configured with a compare algo which knows how to compute the compare (0-1) between the fields of two
* objects.
*/
public interface DistanceAlgo {

View File

@ -1,126 +0,0 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.distance.eval.DistanceEval;
import eu.dnetlib.pace.distance.eval.DistanceEvalMap;
import eu.dnetlib.pace.distance.eval.ScoreResult;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
/**
* The distance between two documents is given by the weighted mean of the field distances
*/
public class DistanceScorer {
private static final Log log = LogFactory.getLog(DistanceScorer.class);
private Config config;
public DistanceScorer(final Config config) {
this.config = config;
}
public ScoreResult distance(final Document a, final Document b) {
final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison
sr.setStrictConditions(verify(a, b, config.strictConditions()));
sr.setConditions(verify(a, b, config.conditions()));
final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model()));
for (final FieldDef fd : config.model()) {
dMap.updateDistance(fieldDistance(a, b, fd));
}
sr.setDistances(dMap);
return sr;
}
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
final ConditionEvalMap res = new ConditionEvalMap();
for (final ConditionAlgo cd : conditions) {
final ConditionEvalMap map = cd.verify(a, b);
res.mergeFrom(map);
// commented out shortcuts
/*
if (map.anyNegative()) {
return res;
}
*/
//if (strict && (res < 0)) return -1;
//cond += verify;
}
return res;
}
private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) {
final double w = fd.getWeight();
final Field va = getValue(a, fd);
final Field vb = getValue(b, fd);
final DistanceEval de = new DistanceEval(fd, va, vb);
if ((w == 0)) return de; // optimization for 0 weight
else {
if (va.isEmpty() || vb.isEmpty()) {
if (fd.isIgnoreMissing()) {
de.setDistance(-1);
} else {
de.setDistance(w);
}
} else {
if (va.getType().equals(vb.getType())) {
de.setDistance(w * fd.distanceAlgo().distance(va, vb));
} else {
throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
}
}
return de;
}
}
private Field getValue(final Document d, final FieldDef fd) {
final Field v = d.values(fd.getName());
if (fd.getLength() > 0) {
if (v instanceof FieldValueImpl) {
((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
} else if (v instanceof FieldListImpl) {
List<String> strings = ((FieldListImpl) v).stringList();
strings = strings.stream()
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
.collect(Collectors.toList());
((FieldListImpl) v).clear();
((FieldListImpl) v).addAll(strings.stream()
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
.map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
.collect(Collectors.toList()));
}
}
return v;
}
private double sumWeights(final Collection<FieldDef> fields) {
double sum = 0.0;
for (final FieldDef fd : fields) {
sum += fd.getWeight();
}
return sum;
}
}

View File

@ -1,12 +1,12 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.model.Document;
public class PaceDocumentDistance extends AbstractDistance<Document> {
@Override
protected Document toDocument(Document a) {
return a;
}
}
//package eu.dnetlib.pace.distance;
//
//import eu.dnetlib.pace.model.Document;
//
//public class PaceDocumentDistance extends AbstractDistance<Document> {
//
// @Override
// protected Document toDocument(Document a) {
// return a;
// }
//
//}

View File

@ -0,0 +1,125 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.tree.support.MatchType;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException;
import eu.dnetlib.pace.util.Reporter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.List;
import java.util.Map;
/**
* The compare between two documents is given by the weighted mean of the field distances
*/
public class PairwiseComparison {
private static final Log log = LogFactory.getLog(PairwiseComparison.class);
private Config config;
public PairwiseComparison(final Config config) {
this.config = config;
}
public boolean compare(final MapDocument a, final MapDocument b) {
//verify sufficientConditions
if (verify(a, b, config.sufficientConditions()).result() > 0)
return true;
//verify necessaryConditions
if (verify(a, b, config.necessaryConditions()).result() < 0)
return false;
//evaluate the decision tree
return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH;
}
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
final ConditionEvalMap res = new ConditionEvalMap();
for (final ConditionAlgo cd : conditions) {
final ConditionEvalMap map = cd.verify(a, b);
res.mergeFrom(map);
// commented out shortcuts
/*
if (map.anyNegative()) {
return res;
}
*/
//if (strict && (res < 0)) return -1;
//cond += verify;
}
return res;
}
public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map<String, TreeNodeDef> decisionTree){
String current = "start";
double similarity;
while (MatchType.parse(current)==MatchType.UNDEFINED) {
TreeNodeDef currentNode = decisionTree.get(current);
//throw an exception if the node doesn't exist
if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + current);
similarity = currentNode.evaluate(doc1, doc2);
if (similarity == -1) {
current = currentNode.getUndefined();
}
else if (similarity>=currentNode.getThreshold()){
current = currentNode.getPositive();
}
else {
current = currentNode.getNegative();
}
}
return MatchType.parse(current);
}
// private Field getValue(final Document d, final FieldDef fd) {
// final Field v = d.values(fd.getName());
// if (fd.getLength() > 0) {
//
// if (v instanceof FieldValueImpl) {
// ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
// } else if (v instanceof FieldListImpl) {
// List<String> strings = ((FieldListImpl) v).stringList();
// strings = strings.stream()
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
// .collect(Collectors.toList());
// ((FieldListImpl) v).clear();
// ((FieldListImpl) v).addAll(strings.stream()
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
// .map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
// .collect(Collectors.toList()));
// }
// }
//
// return v;
// }
//
// private double sumWeights(final Collection<FieldDef> fields) {
// double sum = 0.0;
// for (final FieldDef fd : fields) {
// sum += fd.getWeight();
// }
// return sum;
// }
}

View File

@ -12,7 +12,7 @@ import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
/**
* For the rest of the fields delegate the distance measure to the second string library.
* For the rest of the fields delegate the compare measure to the second string library.
*/
public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo {
@ -35,7 +35,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
}
/**
* Instantiates a new second string distance algo.
* Instantiates a new second string compare algo.
*
* @param weight
* the weight
@ -90,7 +90,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
* @see eu.dnetlib.pace.compare.DistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
*/
@Override
public double distance(final Field a, final Field b) {

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.DistanceScorer;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

View File

@ -7,7 +7,7 @@ import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
* Compared distance between two titles, ignoring version numbers. Suitable for Software entities.
* Compared compare between two titles, ignoring version numbers. Suitable for Software entities.
*/
@DistanceClass("LevensteinTitleIgnoreVersion")
public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo {

View File

@ -7,7 +7,7 @@ import eu.dnetlib.pace.model.Field;
import java.util.Map;
/**
* Not all fields of a document need to partecipate in the distance measure. We model those fields as having a
* Not all fields of a document need to partecipate in the compare measure. We model those fields as having a
* NullDistanceAlgo.
*/
@DistanceClass("Null")

View File

@ -40,7 +40,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/
@Override
public double getWeight() {
@ -50,7 +50,7 @@ public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/
@Override
protected double normalize(final double d) {

View File

@ -40,7 +40,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/
@Override
public double getWeight() {
@ -50,7 +50,7 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/
@Override
protected double normalize(final double d) {

View File

@ -12,12 +12,12 @@ import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
/**
* For the rest of the fields delegate the distance measure to the second string library.
* For the rest of the fields delegate the compare measure to the second string library.
*/
public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo {
/**
* Instantiates a new sorted second string distance algo.
* Instantiates a new sorted second string compare algo.
*
* @param weight
* the weight
@ -35,7 +35,7 @@ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanc
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
*/
@Override
protected List<String> toList(final Field list) {

View File

@ -66,7 +66,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
*/
@Override
public double distance(final Field a, final Field b) {
@ -79,7 +79,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/
@Override
public double getWeight() {
@ -89,7 +89,7 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/
@Override
protected double normalize(final double d) {

View File

@ -1,32 +0,0 @@
package eu.dnetlib.pace.distance.eval;
import java.util.HashMap;
/**
* Created by claudio on 10/03/16.
*/
public class DistanceEvalMap extends HashMap<String, DistanceEval> {
private double sumWeights;
private double sumDistances = 0.0;
public DistanceEvalMap(final double sumWeights) {
this.sumWeights = sumWeights;
}
public void updateDistance(final DistanceEval d) {
put(d.getFieldDef().getName(), d);
if (d.getDistance() >= 0) {
sumDistances += d.getDistance();
} else {
sumWeights -= d.getFieldDef().getWeight();
}
}
public double distance() {
return sumWeights == 0 ? 0 : sumDistances / sumWeights;
}
}

View File

@ -1,62 +0,0 @@
package eu.dnetlib.pace.distance.eval;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
/**
* Created by claudio on 09/03/16.
*/
public class ScoreResult implements Serializable {
private ConditionEvalMap strictConditions;
private ConditionEvalMap conditions;
private DistanceEvalMap distances;
public double getScore() {
if (getStrictConditions().result() > 0) return 1.0;
// if (getStrictConditions().result() < 0) return 0.0;
if (getConditions().result() < 0) return 0.0;
return getDistances().distance();
}
public ConditionEvalMap getStrictConditions() {
return strictConditions;
}
public void setStrictConditions(final ConditionEvalMap strictConditions) {
this.strictConditions = strictConditions;
}
public ConditionEvalMap getConditions() {
return conditions;
}
public void setConditions(final ConditionEvalMap conditions) {
this.conditions = conditions;
}
public DistanceEvalMap getDistances() {
return distances;
}
public void setDistances(final DistanceEvalMap distances) {
this.distances = distances;
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("unable to serialise " + this.getClass().getName(), e);
}
}
}

View File

@ -14,25 +14,29 @@ import java.util.List;
import java.util.Map;
/**
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm.
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
*/
public class FieldDef implements Serializable {
public final static String PATH_SEPARATOR = "/";
private String algo;
private String name;
private String path;
private boolean ignoreMissing;
private Type type;
private boolean overrideMatch;
private boolean ignoreMissing;
private double weight;
public boolean isIgnoreMissing() {
return ignoreMissing;
}
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
}
private boolean overrideMatch;
/**
* Sets maximum size for the repeatable fields in the model. -1 for unbounded size.
@ -74,20 +78,6 @@ public class FieldDef implements Serializable {
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
}
public DistanceAlgo distanceAlgo() {
if (params == null) {
params = new HashMap<>();
}
params.put("weight", getWeight());
return PaceConfig.resolver.getDistanceAlgo(getAlgo(), params);
}
public boolean isIgnoreMissing() {
return ignoreMissing;
}
public Type getType() {
return type;
}
@ -104,23 +94,6 @@ public class FieldDef implements Serializable {
this.overrideMatch = overrideMatch;
}
public double getWeight() {
return weight;
}
public void setWeight(final double weight) {
this.weight = weight;
}
public String getAlgo() {
return algo;
}
public void setAlgo(final String algo) {
this.algo = algo;
}
public int getSize() {
return size;
}
@ -153,10 +126,6 @@ public class FieldDef implements Serializable {
this.path = path;
}
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
}
@Override
public String toString() {
return new Gson().toJson(this);

View File

@ -0,0 +1,42 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("alwaysMatch")
public class AlwaysMatch extends AbstractComparator {
public AlwaysMatch(final Map<String, Number> params){
super(params, new com.wcohen.ss.JaroWinkler());
}
public AlwaysMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(final String a, final String b) {
return 1.0;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("exactMatch")
public class ExactMatch extends AbstractComparator {
public ExactMatch(Map<String, Number> params){
super(params, new com.wcohen.ss.JaroWinkler());
}
public ExactMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(final String a, final String b) {
return a.equals(b) ? 1.0 : 0;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.io.Serializable;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@ComparatorClass("jaroWinkler")
public class JaroWinkler extends AbstractComparator {
public JaroWinkler(Map<String, Number> params){
super(params, new com.wcohen.ss.JaroWinkler());
}
public JaroWinkler(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(String a, String b) {
String ca = cleanup(a);
String cb = cleanup(b);
return normalize(ssalgo.score(ca, cb));
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return d;
}
}

View File

@ -0,0 +1,78 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;
@ComparatorClass("jaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends AbstractComparator {
private Map<String, Number> params;
public JaroWinklerNormalizedName(Map<String, Number> params){
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
}
public JaroWinklerNormalizedName(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(String a, String b) {
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
if (sameCity(cities1,cities2)) {
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1);
cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2);
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca,cb));
}
}
return 0.0;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return d;
}
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@ComparatorClass("jaroWinklerTitle")
public class JaroWinklerTitle extends AbstractComparator {
public JaroWinklerTitle(Map<String, Number> params){
super(params, new com.wcohen.ss.JaroWinkler());
}
public JaroWinklerTitle(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(String a, String b) {
String ca = cleanup(a);
String cb = cleanup(b);
boolean check = checkNumbers(ca, cb);
return check ? 0.5 : normalize(ssalgo.score(ca, cb));
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return d;
}
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("level2JaroWinkler")
public class Level2JaroWinkler extends AbstractComparator {
public Level2JaroWinkler(Map<String, Number> params){
super(params, new com.wcohen.ss.Level2JaroWinkler());
}
public Level2JaroWinkler(double w) {
super(w, new com.wcohen.ss.Level2JaroWinkler());
}
protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return d;
}
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("level2JaroWinklerTitle")
public class Level2JaroWinklerTitle extends AbstractComparator {
public Level2JaroWinklerTitle(Map<String,Number> params){
super(params, new com.wcohen.ss.Level2JaroWinkler());
}
public Level2JaroWinklerTitle(final double w) {
super(w, new com.wcohen.ss.Level2JaroWinkler());
}
protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
@Override
public double distance(final String a, final String b) {
final String ca = cleanup(a);
final String cb = cleanup(b);
final boolean check = checkNumbers(ca, cb);
if (check) return 0.5;
final String cca = finalCleanup(ca);
final String ccb = finalCleanup(cb);
return ssalgo.score(cca, ccb);
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("level2Levenstein")
public class Level2Levenstein extends AbstractComparator {
public Level2Levenstein(Map<String,Number> params){
super(params, new com.wcohen.ss.Level2Levenstein());
}
public Level2Levenstein(double w) {
super(w, new com.wcohen.ss.Level2Levenstein());
}
protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
}
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("levenstein")
public class Levenstein extends AbstractComparator {
public Levenstein(Map<String,Number> params){
super(params, new com.wcohen.ss.Levenstein());
}
public Levenstein(double w) {
super(w, new com.wcohen.ss.Levenstein());
}
protected Levenstein(double w, AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
}
}

View File

@ -0,0 +1,59 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Map;
@ComparatorClass("levensteinTitle")
public class LevensteinTitle extends AbstractComparator {
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
public LevensteinTitle(Map<String,Number> params){
super(params, new com.wcohen.ss.Levenstein());
}
public LevensteinTitle(final double w) {
super(w, new com.wcohen.ss.Levenstein());
}
protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
@Override
public double distance(final String a, final String b) {
final String ca = cleanup(a);
final String cb = cleanup(b);
final boolean check = checkNumbers(ca, cb);
if (check) return 0.5;
final String cca = finalCleanup(ca);
final String ccb = finalCleanup(cb);
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
}
private double normalize(final double score, final int la, final int lb) {
return 1 - (Math.abs(score) / Math.max(la, lb));
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
}
}

View File

@ -0,0 +1,60 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* Compared compare between two titles, ignoring version numbers. Suitable for Software entities.
*/
@ComparatorClass("levensteinTitleIgnoreVersion")
public class LevensteinTitleIgnoreVersion extends AbstractComparator {
public LevensteinTitleIgnoreVersion(Map<String,Number> params){
super(params, new com.wcohen.ss.Levenstein());
}
public LevensteinTitleIgnoreVersion(final double w) {
super(w, new com.wcohen.ss.Levenstein());
}
protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
@Override
public double distance(final String a, final String b) {
String ca = cleanup(a);
String cb = cleanup(b);
ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim();
cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim();
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
final String cca = finalCleanup(ca);
final String ccb = finalCleanup(cb);
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
}
private double normalize(final double score, final int la, final int lb) {
return 1 - (Math.abs(score) / Math.max(la, lb));
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
}
}

View File

@ -0,0 +1,41 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("mustBeDifferent")
public class MustBeDifferent extends AbstractComparator {
public MustBeDifferent(Map<String,Number> params){
super(params, new com.wcohen.ss.Levenstein());
}
public MustBeDifferent(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(final String a, final String b) {
return !a.equals(b) ? 1.0 : 0;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* Not all fields of a document need to partecipate in the compare measure. We model those fields as having a
* NullDistanceAlgo.
*/
@ComparatorClass("null")
public class NullDistanceAlgo implements Comparator {
public NullDistanceAlgo(Map<String, Number> params){
}
@Override
public double compare(Field a, Field b) {
return 0;
}
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* The Class SortedJaroWinkler.
*/
@ComparatorClass("sortedJaroWinkler")
public class SortedJaroWinkler extends AbstractSortedComparator {
public SortedJaroWinkler(Map<String,Number> params){
super(params, new com.wcohen.ss.Levenstein());
}
/**
* Instantiates a new sorted jaro winkler.
*
* @param weight
* the weight
*/
public SortedJaroWinkler(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
/**
* Instantiates a new sorted jaro winkler.
*
* @param weight
* the weight
* @param ssalgo
* the ssalgo
*/
protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/
@Override
public double getWeight() {
return super.weight;
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* The Class SortedJaroWinkler.
*/
@ComparatorClass("sortedLevel2JaroWinkler")
public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
/**
* Instantiates a new sorted jaro winkler.
*
* @param weight
* the weight
*/
public SortedLevel2JaroWinkler(final double weight) {
super(weight, new com.wcohen.ss.Level2JaroWinkler());
}
public SortedLevel2JaroWinkler(final Map<String, Number> params){
super(params, new com.wcohen.ss.Level2JaroWinkler());
}
/**
* Instantiates a new sorted jaro winkler.
*
* @param weight
* the weight
* @param ssalgo
* the ssalgo
*/
protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/
@Override
public double getWeight() {
return super.weight;
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -0,0 +1,99 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang.StringUtils;
import java.util.Map;
/**
* The Class SubStringLevenstein.
*/
@ComparatorClass("subStringLevenstein")
public class SubStringLevenstein extends AbstractComparator {
/** The limit. */
protected int limit;
/**
* Instantiates a new sub string levenstein.
*
* @param w
* the w
*/
public SubStringLevenstein(final double w) {
super(w, new com.wcohen.ss.Levenstein());
}
public SubStringLevenstein(Map<String, Number> params){
super(params, new com.wcohen.ss.Levenstein());
this.limit = params.get("limit").intValue();
}
/**
* Instantiates a new sub string levenstein.
*
* @param w
* the w
* @param limit
* the limit
*/
public SubStringLevenstein(final double w, final int limit) {
super(w, new com.wcohen.ss.Levenstein());
this.limit = limit;
}
/**
* Instantiates a new sub string levenstein.
*
* @param w
* the w
* @param limit
* the limit
* @param ssalgo
* the ssalgo
*/
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
super(w, ssalgo);
this.limit = limit;
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
*/
@Override
public double compare(final Field a, final Field b) {
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/
@Override
public double getWeight() {
return super.weight;
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/
@Override
protected double normalize(final double d) {
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
}
}

View File

@ -0,0 +1,60 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang.StringUtils;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
@ComparatorClass("urlMatcher")
public class UrlMatcher extends Levenstein {
private Map<String, Number> params;
public UrlMatcher(Map<String, Number> params){
super(params);
this.params = params;
}
public UrlMatcher(double weight, Map<String, Number> params) {
super(weight);
this.params = params;
}
public void setParams(Map<String, Number> params) {
this.params = params;
}
@Override
public double compare(Field a, Field b) {
final URL urlA = asUrl(getFirstValue(a));
final URL urlB = asUrl(getFirstValue(b));
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
return 0.0;
}
Double hostW = params.get("host").doubleValue();
Double pathW = params.get("path").doubleValue();
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
return hostW * 0.5;
}
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
}
private URL asUrl(final String value) {
try {
return new URL(value);
} catch (MalformedURLException e) {
// should not happen as checked by pace typing
throw new IllegalStateException("invalid URL: " + value);
}
}
}

View File

@ -0,0 +1,110 @@
package eu.dnetlib.pace.tree.support;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import java.util.List;
import java.util.Map;
public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator {
/** The ssalgo. */
protected AbstractStringDistance ssalgo;
/** The weight. */
protected double weight = 0.0;
private Map<String, Number> params;
protected AbstractComparator(Map<String, Number> params, final AbstractStringDistance ssalgo){
this.params = params;
this.weight = 1.0;
this.ssalgo = ssalgo;
}
/**
* Instantiates a new second string compare algo.
*
* @param weight
* the weight
* @param ssalgo
* the ssalgo
*/
protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
this.ssalgo = ssalgo;
this.weight = weight;
}
protected AbstractComparator(final AbstractStringDistance ssalgo){
this.ssalgo = ssalgo;
}
/**
* Normalize.
*
* @param d
* the d
* @return the double
*/
protected abstract double normalize(double d);
/**
* Distance.
*
* @param a
* the a
* @param b
* the b
* @return the double
*/
public double distance(final String a, final String b) {
if (a.isEmpty() || b.isEmpty()) {
return -1; //return -1 if a field is missing
}
double score = ssalgo.score(a, b);
return normalize(score);
}
/**
* Distance.
*
* @param a
* the a
* @param b
* the b
* @return the double
*/
protected double distance(final List<String> a, final List<String> b) {
return distance(concat(a), concat(b));
}
@Override
public double compare(final Field a, final Field b) {
if (a.isEmpty() || b.isEmpty())
return -1;
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue());
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b));
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
}
/**
* To list.
*
* @param list
* the list
* @return the list
*/
protected List<String> toList(final Field list) {
return ((FieldList) list).stringList();
}
public double getWeight(){
return this.weight;
}
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
public abstract class AbstractSortedComparator extends AbstractComparator {
/**
* Instantiates a new sorted second string compare algo.
*
* @param weight
* the weight
* @param ssalgo
* the ssalgo
*/
protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
protected AbstractSortedComparator(final Map<String, Number> params, final AbstractStringDistance ssalgo){
super(params.get("weight").doubleValue(), ssalgo);
}
@Override
protected List<String> toList(final Field list) {
FieldList fl = (FieldList) list;
List<String> values = Lists.newArrayList(fl.stringList());
Collections.sort(values);
return values;
}
}

View File

@ -0,0 +1,22 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
public enum AggType {
WEIGHTED_MEAN,
AVG,
SUM,
MAX,
MIN;
public static AggType getEnum(String value) {
try {
return AggType.valueOf(value);
}
catch (IllegalArgumentException e) {
throw new PaceException("Undefined aggregation type", e);
}
}
}

View File

@ -0,0 +1,9 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.model.Field;
public interface Comparator {
public double compare(Field a, Field b);
}

View File

@ -0,0 +1,13 @@
package eu.dnetlib.pace.tree.support;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface ComparatorClass {
public String value();
}

View File

@ -1,4 +1,4 @@
package eu.dnetlib.pace.model;
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
@ -14,14 +14,25 @@ public class FieldConf implements Serializable {
private double weight = 1.0; //weight for the field (to be used in the aggregation)
private Map<String,Number> params; //parameters
private boolean ignoreMissing;
public boolean isIgnoreMissing() {
return ignoreMissing;
}
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
}
public FieldConf() {
}
public FieldConf(String field, String comparator, double weight, Map<String, Number> params) {
public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean ignoreMissing) {
this.field = field;
this.comparator = comparator;
this.weight = weight;
this.params = params;
this.ignoreMissing = ignoreMissing;
}
public String getField() {

View File

@ -0,0 +1,18 @@
package eu.dnetlib.pace.tree.support;
public enum MatchType {
MATCH,
NO_MATCH,
UNDEFINED;
public static MatchType parse(String value) {
try {
return MatchType.valueOf(value);
}
catch (IllegalArgumentException e) {
return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
}
}
}

View File

@ -0,0 +1,157 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
public class TreeNodeDef implements Serializable {
private List<FieldConf> fields;
private AggType aggregation;
private double threshold;
private String positive;
private String negative;
private String undefined;
boolean ignoreMissing;
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreMissing) {
this.fields = fields;
this.aggregation = aggregation;
this.threshold = threshold;
this.positive = positive;
this.negative = negative;
this.undefined = undefined;
this.ignoreMissing = ignoreMissing;
}
public TreeNodeDef() {
}
public double evaluate(MapDocument doc1, MapDocument doc2) {
DescriptiveStatistics stats = new DescriptiveStatistics();
double sumWeights = 0.0; //for the weighted mean
int missCount = 0; //counter for the number of misses
for (FieldConf fieldConf : fields) {
double weight = fieldConf.getWeight();
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
if (result >= 0.0) { //if the field is not missing
stats.addValue(weight * result);
sumWeights += weight; //sum weights, to be used in case of weighted mean
}
else { //if the field is missing
missCount += 1;
if (!fieldConf.isIgnoreMissing()){ //if the miss has not to be ignored
stats.addValue(weight * 0);
sumWeights += weight;
}
}
}
//global ignoremissing (if one of the field is missing, return undefined)
if (!ignoreMissing && missCount>0) {
return -1;
}
switch (aggregation){
case AVG:
return stats.getMean();
case SUM:
return stats.getSum();
case MAX:
return stats.getMax();
case MIN:
return stats.getMin();
case WEIGHTED_MEAN:
return stats.getSum()/sumWeights;
default:
return 0.0;
}
}
private Comparator comparator(final FieldConf field){
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
}
public List<FieldConf> getFields() {
return fields;
}
public void setFields(List<FieldConf> fields) {
this.fields = fields;
}
public AggType getAggregation() {
return aggregation;
}
public void setAggregation(AggType aggregation) {
this.aggregation = aggregation;
}
public double getThreshold() {
return threshold;
}
public void setThreshold(double threshold) {
this.threshold = threshold;
}
public String getPositive() {
return positive;
}
public void setPositive(String positive) {
this.positive = positive;
}
public String getNegative() {
return negative;
}
public void setNegative(String negative) {
this.negative = negative;
}
public String getUndefined() {
return undefined;
}
public void setUndefined(String undefined) {
this.undefined = undefined;
}
public boolean isIgnoreMissing() {
return ignoreMissing;
}
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("Impossible to convert to JSON: ", e);
}
}
}

View File

@ -4,8 +4,8 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.distance.PaceDocumentDistance;
import eu.dnetlib.pace.distance.eval.ScoreResult;
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
import eu.dnetlib.pace.distance.PairwiseComparison;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentComparator;
@ -116,7 +116,7 @@ public class BlockProcessor {
private void process(final Queue<MapDocument> queue, final Reporter context) {
final PaceDocumentDistance algo = new PaceDocumentDistance();
// final PaceDocumentDistance algo = new PaceDocumentDistance();
while (!queue.isEmpty()) {
@ -150,21 +150,23 @@ public class BlockProcessor {
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
final ScoreResult sr = similarity(algo, pivot, curr);
// log.info(sr.toString()+"SCORE "+ sr.getScore());
emitOutput(sr, idPivot, idCurr, context);
i++;
final PairwiseComparison pairwiseComparison = new PairwiseComparison(dedupConf);
emitOutput(pairwiseComparison.compare(pivot, curr), idPivot, idCurr, context);
// final ScoreResult sr = similarity(algo, pivot, curr);
//// log.info(sr.toString()+"SCORE "+ sr.getScore());
// emitOutput(sr, idPivot, idCurr, context);
// i++;
}
}
}
}
}
private void emitOutput(final ScoreResult sr, final String idPivot, final String idCurr, final Reporter context) {
final double d = sr.getScore();
if (d >= dedupConf.getWf().getThreshold()) {
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
if (result) {
writeSimilarity(context, idPivot, idCurr);
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
} else {
@ -172,15 +174,6 @@ public class BlockProcessor {
}
}
private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) {
try {
return algo.between(a, b, dedupConf);
} catch(Throwable e) {
log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e);
throw new IllegalArgumentException(e);
}
}
private boolean mustSkip(final String idPivot) {
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
}

View File

@ -1390,7 +1390,7 @@ public class DiffPatchMatch {
}
/**
* Compute the Levenshtein distance; the number of inserted, deleted or
* Compute the Levenshtein compare; the number of inserted, deleted or
* substituted characters.
* @param diffs List of Diff objects.
* @return Number of changes.
@ -1655,7 +1655,7 @@ public class DiffPatchMatch {
score_threshold = score;
best_loc = j - 1;
if (best_loc > loc) {
// When passing loc, don't exceed our current distance from loc.
// When passing loc, don't exceed our current compare from loc.
start = Math.max(1, 2 * loc - best_loc);
} else {
// Already passed loc, downhill from here on in.

View File

@ -7,6 +7,8 @@ import eu.dnetlib.pace.condition.ConditionClass;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.reflections.Reflections;
import java.io.Serializable;
@ -19,11 +21,13 @@ public class PaceResolver implements Serializable {
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition");
public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.distance.algo");
public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.compare.algo");
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
private final Map<String, Class<ConditionAlgo>> conditionAlgos;
private final Map<String, Class<DistanceAlgo>> distanceAlgos;
private final Map<String, Class<Comparator>> comparators;
public PaceResolver() {
@ -38,6 +42,10 @@ public class PaceResolver implements Serializable {
this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream()
.filter(DistanceAlgo.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
.filter(Comparator.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl));
}
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
@ -64,4 +72,12 @@ public class PaceResolver implements Serializable {
}
}
public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
try {
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
throw new PaceException(name + " not found ", e);
}
}
}

View File

@ -16,9 +16,9 @@
"pace" : {
"clustering" : [
],
"strictConditions" : [
"sufficientConditions" : [
],
"conditions" : [
"necessaryConditions" : [
],
"model" : [
],

View File

@ -16,10 +16,10 @@
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
],
"strictConditions" : [
"sufficientConditions" : [
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
],
"conditions" : [
"necessaryConditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] },
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
],