forked from D-Net/dnet-hadoop
implementation of the conditions in tree nodes. get rid of the conditions part of the configuration
This commit is contained in:
parent
a5c5d2f01b
commit
d71dae5fd2
|
@ -15,10 +15,7 @@ import org.apache.commons.lang.StringUtils;
|
|||
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* Set of common functions
|
||||
|
|
|
@ -1,55 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Abstract necessaryConditions needs a list of field names.
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
*/
|
||||
public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo {
|
||||
|
||||
protected String cond;
|
||||
|
||||
protected List<FieldDef> fields;
|
||||
|
||||
public AbstractCondition(final String cond, final List<FieldDef> fields) {
|
||||
this.cond = cond;
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
|
||||
|
||||
@Override
|
||||
public ConditionEvalMap verify(final Document a, final Document b) {
|
||||
final ConditionEvalMap res = new ConditionEvalMap();
|
||||
for (final FieldDef fd : getFields()) {
|
||||
|
||||
final Field va = a.values(fd.getName());
|
||||
final Field vb = b.values(fd.getName());
|
||||
|
||||
if (fd.isIgnoreMissing()) {
|
||||
res.put(fd.getName(), verify(fd, va, vb));
|
||||
} else {
|
||||
if (va.isEmpty() || vb.isEmpty()) {
|
||||
res.put(fd.getName(), new ConditionEval(cond, va, vb, -1));
|
||||
} else {
|
||||
res.put(fd.getName(), verify(fd, va, vb));
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
public List<FieldDef> getFields() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,25 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Default always true condition
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("alwaystruecondition")
|
||||
public class AlwaysTrueCondition extends AbstractCondition {
|
||||
|
||||
public AlwaysTrueCondition(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
return new ConditionEval(cond, a, b, 1);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,27 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Allows to express general necessaryConditions to be satisfied or not between two Documents.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public interface ConditionAlgo {
|
||||
|
||||
/**
|
||||
* Verify a condition.
|
||||
*
|
||||
* @param a
|
||||
* the Document a
|
||||
* @param b
|
||||
* the Document b
|
||||
* @return 0 when condition cannot be verified (ignoremissing = true). Positive int when the condition is verified. Negative int when
|
||||
* the condition is not verified.
|
||||
*/
|
||||
public abstract ConditionEvalMap verify(Document a, Document b);
|
||||
|
||||
}
|
|
@ -1,13 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Marks a type with the name under which the condition is registered.
 * The annotation is retained at runtime so that it can be read reflectively.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ConditionClass {

    /**
     * @return the name associated with the annotated condition class
     */
    String value();
}
|
|
@ -1,27 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* The Class ExactMatch.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("doiExactMatch")
|
||||
public class DoiExactMatch extends ExactMatchIgnoreCase {
|
||||
|
||||
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
public DoiExactMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getValue(final Field f) {
|
||||
return super.getValue(f).replaceAll(PREFIX, "");
|
||||
}
|
||||
|
||||
}
|
|
@ -1,32 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
|
||||
@ConditionClass("DomainExactMatch")
|
||||
public class DomainExactMatch extends ExactMatchIgnoreCase {
|
||||
|
||||
public DomainExactMatch(String cond, List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getValue(final Field f) {
|
||||
return asUrl(super.getValue(f)).getHost();
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) {
|
||||
try {
|
||||
if (value.isEmpty())
|
||||
return new URL("http://");
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
/**
|
||||
* The Class ExactMatch.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("exactMatch")
|
||||
public class ExactMatch extends AbstractCondition {
|
||||
|
||||
public ExactMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
|
||||
final String fa = getValue(a);
|
||||
final String fb = getValue(b);
|
||||
|
||||
int res;
|
||||
|
||||
// if (StringUtils.isBlank(fa) && StringUtils.isBlank(fb)) {
|
||||
// res = 0;
|
||||
// } else {
|
||||
// res = fa.equals(fb) ? 1 : -1;
|
||||
// }
|
||||
|
||||
//if there is a blank, undefined result
|
||||
if (StringUtils.isBlank(fa) || StringUtils.isBlank(fb)) {
|
||||
res = 0;
|
||||
} else {
|
||||
res = fa.equals(fb) ? 1 : -1;
|
||||
}
|
||||
|
||||
return new ConditionEval(cond, a, b, res);
|
||||
}
|
||||
|
||||
protected String getValue(final Field f) {
|
||||
return getFirstValue(f);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,43 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
/**
|
||||
* The Class ExactMatch.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("exactMatchIgnoreCase")
|
||||
public class ExactMatchIgnoreCase extends AbstractCondition {
|
||||
|
||||
public ExactMatchIgnoreCase(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
|
||||
final String fa = getValue(a);
|
||||
final String fb = getValue(b);
|
||||
|
||||
int res;
|
||||
|
||||
if (StringUtils.isBlank(fa) || StringUtils.isBlank(fb)) {
|
||||
res = 0;
|
||||
} else {
|
||||
res = fa.equalsIgnoreCase(fb) ? 1 : -1;
|
||||
}
|
||||
|
||||
return new ConditionEval(cond, a, b, res);
|
||||
}
|
||||
|
||||
protected String getValue(final Field f) {
|
||||
return getFirstValue(f);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,56 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Returns true if the field values are different.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("mustBeDifferent")
|
||||
public class MustBeDifferent extends AbstractCondition {
|
||||
|
||||
/**
|
||||
* Instantiates a new size match.
|
||||
*
|
||||
* @param fields the fields
|
||||
*/
|
||||
public MustBeDifferent(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List)
|
||||
*/
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
|
||||
final String fa = getValue(a);
|
||||
final String fb = getValue(b);
|
||||
|
||||
return new ConditionEval(cond, a, b, fa.equals(fb) ? -1 : 1);
|
||||
|
||||
}
|
||||
|
||||
protected String getValue(final Field f) {
|
||||
return getFirstValue(f);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if is empty.
|
||||
*
|
||||
* @param a the a
|
||||
* @return true, if is empty
|
||||
*/
|
||||
protected boolean isEmpty(final Iterable<?> a) {
|
||||
return (a == null) || Iterables.isEmpty(a);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,63 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.adaptor.Pid;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* The Class PidMatch.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("pidMatch")
|
||||
public class PidMatch extends AbstractCondition {
|
||||
|
||||
private static final Log log = LogFactory.getLog(PidMatch.class);
|
||||
|
||||
public PidMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
|
||||
final List<String> sa = ((FieldList) a).stringList();
|
||||
final List<String> sb = ((FieldList) b).stringList();
|
||||
|
||||
final List<Pid> pal = Pid.fromOafJson(sa);
|
||||
final List<Pid> pbl = Pid.fromOafJson(sb);
|
||||
|
||||
final Set<String> pidAset = toHashSet(pal);
|
||||
final Set<String> pidBset = toHashSet(pbl);
|
||||
|
||||
int incommon = Sets.intersection(pidAset, pidBset).size();
|
||||
int simDiff = Sets.symmetricDifference(pidAset, pidBset).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return new ConditionEval(cond, a, b, 0);
|
||||
}
|
||||
|
||||
int result = incommon / (incommon + simDiff) > 0.5 ? 1 : -1;
|
||||
|
||||
return new ConditionEval(cond, a, b, result);
|
||||
}
|
||||
|
||||
//lowercase + normalization of the pid before adding it to the set
|
||||
private Set<String> toHashSet(List<Pid> pbl) {
|
||||
|
||||
return pbl.stream()
|
||||
.map(pid -> pid.getType() + normalizePid(pid.getValue()))
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
}
|
|
@ -1,56 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Returns true if the number of values in the fields is the same.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("sizeMatch")
|
||||
public class SizeMatch extends AbstractCondition {
|
||||
|
||||
/**
|
||||
* Instantiates a new size match.
|
||||
*
|
||||
* @param fields
|
||||
* the fields
|
||||
*/
|
||||
public SizeMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List)
|
||||
*/
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
|
||||
// if (a.isEmpty() & b.isEmpty()) return 1;
|
||||
//
|
||||
// if (a.isEmpty()) return -1;
|
||||
// if (b.isEmpty()) return -1;
|
||||
|
||||
return new ConditionEval(cond, a, b, Iterables.size(a) == Iterables.size(b) ? 1 : -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if is empty.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @return true, if is empty
|
||||
*/
|
||||
protected boolean isEmpty(final Iterable<?> a) {
|
||||
return (a == null) || Iterables.isEmpty(a);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,35 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Returns true if the titles in the given documents contains the same numbers, false otherwise.
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
*/
|
||||
@ConditionClass("titleVersionMatch")
|
||||
public class TitleVersionMatch extends AbstractCondition {
|
||||
|
||||
public TitleVersionMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
final String valueA = getFirstValue(a);
|
||||
final String valueB = getFirstValue(b);
|
||||
|
||||
return new ConditionEval(cond, a, b, notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : -1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,60 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.time.Year;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Returns true if the year of the date field in the given documents are the same, false when any of the two is invalid or it's missing.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("yearMatch")
|
||||
public class YearMatch extends AbstractCondition {
|
||||
|
||||
private int limit = 4;
|
||||
|
||||
public YearMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
// @Override
|
||||
// public boolean verify(final Document a, final Document b) {
|
||||
// boolean res = true;
|
||||
// for (FieldDef fd : getFields()) {
|
||||
//
|
||||
// }
|
||||
//
|
||||
// return res;
|
||||
// }
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
final String valueA = getNumbers(getFirstValue(a));
|
||||
final String valueB = getNumbers(getFirstValue(b));
|
||||
|
||||
final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);
|
||||
final boolean onemissing = valueA.isEmpty() || valueB.isEmpty();
|
||||
|
||||
return new ConditionEval(cond, a, b, lengthMatch && valueA.equals(valueB) || onemissing ? 1 : -1);
|
||||
}
|
||||
|
||||
protected boolean checkLength(final String s) {
|
||||
return s.length() == limit;
|
||||
}
|
||||
|
||||
protected String getFirstValue(final Field value) {
|
||||
return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -22,7 +22,11 @@ public interface Config {
|
|||
*/
|
||||
public List<FieldDef> model();
|
||||
|
||||
|
||||
/**
|
||||
* Decision Tree definition
|
||||
*
|
||||
* @return the map representing the decision tree
|
||||
*/
|
||||
public Map<String, TreeNodeDef> decisionTree();
|
||||
|
||||
/**
|
||||
|
@ -32,20 +36,6 @@ public interface Config {
|
|||
*/
|
||||
public Map<String, FieldDef> modelMap();
|
||||
|
||||
/**
|
||||
* Strict Pre-Condition definitions.
|
||||
*
|
||||
* @return the list of necessaryConditions
|
||||
*/
|
||||
public List<ConditionAlgo> sufficientConditions();
|
||||
|
||||
/**
|
||||
* Pre-Condition definitions.
|
||||
*
|
||||
* @return the list of necessaryConditions
|
||||
*/
|
||||
public List<ConditionAlgo> necessaryConditions();
|
||||
|
||||
/**
|
||||
* Clusterings.
|
||||
*
|
||||
|
|
|
@ -130,16 +130,6 @@ public class DedupConfig implements Config, Serializable {
|
|||
return getPace().getModelMap();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ConditionAlgo> sufficientConditions() {
|
||||
return getPace().getStrictConditionAlgos();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ConditionAlgo> necessaryConditions() {
|
||||
return getPace().getConditionAlgos();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ClusteringDef> clusterings() {
|
||||
return getPace().getClustering();
|
||||
|
|
|
@ -1,27 +1,20 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.CondDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceResolver;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class PaceConfig implements Serializable {
|
||||
|
||||
private List<FieldDef> model;
|
||||
|
||||
private List<CondDef> sufficientConditions;
|
||||
private List<CondDef> necessaryConditions;
|
||||
private List<ClusteringDef> clustering;
|
||||
private Map<String, TreeNodeDef> decisionTree;
|
||||
|
||||
|
@ -50,32 +43,6 @@ public class PaceConfig implements Serializable {
|
|||
this.model = model;
|
||||
}
|
||||
|
||||
public List<CondDef> getSufficientConditions() {
|
||||
return sufficientConditions;
|
||||
}
|
||||
|
||||
public void setSufficientConditions(final List<CondDef> sufficientConditions) {
|
||||
this.sufficientConditions = sufficientConditions;
|
||||
}
|
||||
|
||||
public List<CondDef> getNecessaryConditions() {
|
||||
return necessaryConditions;
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public List<ConditionAlgo> getConditionAlgos() {
|
||||
return asConditionAlgos(getNecessaryConditions());
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public List<ConditionAlgo> getStrictConditionAlgos() {
|
||||
return asConditionAlgos(getSufficientConditions());
|
||||
}
|
||||
|
||||
public void setNecessaryConditions(final List<CondDef> necessaryConditions) {
|
||||
this.necessaryConditions = necessaryConditions;
|
||||
}
|
||||
|
||||
public List<ClusteringDef> getClustering() {
|
||||
return clustering;
|
||||
}
|
||||
|
@ -108,18 +75,4 @@ public class PaceConfig implements Serializable {
|
|||
this.modelMap = modelMap;
|
||||
}
|
||||
|
||||
// helper
|
||||
|
||||
private List<ConditionAlgo> asConditionAlgos(final List<CondDef> defs) {
|
||||
final List<ConditionAlgo> algos = Lists.newArrayList();
|
||||
if (CollectionUtils.isEmpty(defs)) return algos;
|
||||
for (final CondDef cd : defs) {
|
||||
final List<FieldDef> fields = getModel().stream()
|
||||
.filter(fd -> cd.getFields().contains(fd.getName()))
|
||||
.collect(Collectors.toList());
|
||||
algos.add(cd.conditionAlgo(fields));
|
||||
}
|
||||
return algos;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,15 +0,0 @@
|
|||
//package eu.dnetlib.pace.distance;
|
||||
//
|
||||
//import eu.dnetlib.pace.config.Config;
|
||||
//import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||
//import eu.dnetlib.pace.model.Document;
|
||||
//
|
||||
//public abstract class AbstractDistance<A> implements Distance<A> {
|
||||
//
|
||||
// protected abstract Document toDocument(A a);
|
||||
//
|
||||
// @Override
|
||||
// public boolean between(final A a, final A b, final Config config) {
|
||||
// return new PairwiseComparison(config).compare(toDocument(a), toDocument(b));
|
||||
// }
|
||||
//}
|
|
@ -1,26 +0,0 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
|
||||
public abstract class ConfigurableDistanceAlgo extends AbstractPaceFunctions {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
private double weigth;
|
||||
|
||||
public ConfigurableDistanceAlgo(final Map<String, String> params, final double weight) {
|
||||
this.params = params;
|
||||
this.weigth = weight;
|
||||
}
|
||||
|
||||
public Map<String, String> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
public double getWeigth() {
|
||||
return weigth;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,8 +0,0 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
public interface Distance<A> {
|
||||
|
||||
public boolean between(A a, A b, Config config);
|
||||
}
|
|
@ -1,17 +0,0 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Each field is configured with a compare algo which knows how to compute the compare (0-1) between the fields of two
|
||||
* objects.
|
||||
*/
|
||||
public interface DistanceAlgo {
|
||||
|
||||
public abstract double distance(Field a, Field b);
|
||||
|
||||
public double getWeight();
|
||||
|
||||
}
|
|
@ -1,13 +0,0 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Marks a type with the name under which the distance algorithm is registered.
 * The annotation is retained at runtime so that it can be read reflectively.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface DistanceClass {

    /**
     * @return the name associated with the annotated distance class
     */
    String value();
}
|
|
@ -1,12 +0,0 @@
|
|||
//package eu.dnetlib.pace.distance;
|
||||
//
|
||||
//import eu.dnetlib.pace.model.Document;
|
||||
//
|
||||
//public class PaceDocumentDistance extends AbstractDistance<Document> {
|
||||
//
|
||||
// @Override
|
||||
// protected Document toDocument(Document a) {
|
||||
// return a;
|
||||
// }
|
||||
//
|
||||
//}
|
|
@ -1,125 +0,0 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import eu.dnetlib.pace.tree.support.MatchType;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import eu.dnetlib.pace.util.Reporter;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The compare between two documents is given by the weighted mean of the field distances
|
||||
*/
|
||||
public class PairwiseComparison {
|
||||
|
||||
private static final Log log = LogFactory.getLog(PairwiseComparison.class);
|
||||
|
||||
private Config config;
|
||||
|
||||
public PairwiseComparison(final Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public boolean compare(final MapDocument a, final MapDocument b) {
|
||||
|
||||
//verify sufficientConditions
|
||||
if (verify(a, b, config.sufficientConditions()).result() > 0)
|
||||
return true;
|
||||
|
||||
//verify necessaryConditions
|
||||
if (verify(a, b, config.necessaryConditions()).result() < 0)
|
||||
return false;
|
||||
|
||||
//evaluate the decision tree
|
||||
return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH;
|
||||
}
|
||||
|
||||
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
|
||||
final ConditionEvalMap res = new ConditionEvalMap();
|
||||
|
||||
for (final ConditionAlgo cd : conditions) {
|
||||
final ConditionEvalMap map = cd.verify(a, b);
|
||||
res.mergeFrom(map);
|
||||
|
||||
// commented out shortcuts
|
||||
/*
|
||||
if (map.anyNegative()) {
|
||||
return res;
|
||||
}
|
||||
*/
|
||||
|
||||
//if (strict && (res < 0)) return -1;
|
||||
//cond += verify;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map<String, TreeNodeDef> decisionTree){
|
||||
|
||||
String current = "start";
|
||||
double similarity;
|
||||
|
||||
while (MatchType.parse(current)==MatchType.UNDEFINED) {
|
||||
|
||||
TreeNodeDef currentNode = decisionTree.get(current);
|
||||
//throw an exception if the node doesn't exist
|
||||
if (currentNode == null)
|
||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
||||
|
||||
similarity = currentNode.evaluate(doc1, doc2);
|
||||
|
||||
if (similarity == -1) {
|
||||
current = currentNode.getUndefined();
|
||||
}
|
||||
else if (similarity>=currentNode.getThreshold()){
|
||||
current = currentNode.getPositive();
|
||||
}
|
||||
else {
|
||||
current = currentNode.getNegative();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return MatchType.parse(current);
|
||||
}
|
||||
|
||||
// private Field getValue(final Document d, final FieldDef fd) {
|
||||
// final Field v = d.values(fd.getName());
|
||||
// if (fd.getLength() > 0) {
|
||||
//
|
||||
// if (v instanceof FieldValueImpl) {
|
||||
// ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
|
||||
// } else if (v instanceof FieldListImpl) {
|
||||
// List<String> strings = ((FieldListImpl) v).stringList();
|
||||
// strings = strings.stream()
|
||||
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
// .collect(Collectors.toList());
|
||||
// ((FieldListImpl) v).clear();
|
||||
// ((FieldListImpl) v).addAll(strings.stream()
|
||||
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||
// .map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
|
||||
// .collect(Collectors.toList()));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return v;
|
||||
// }
|
||||
//
|
||||
// private double sumWeights(final Collection<FieldDef> fields) {
|
||||
// double sum = 0.0;
|
||||
// for (final FieldDef fd : fields) {
|
||||
// sum += fd.getWeight();
|
||||
// }
|
||||
// return sum;
|
||||
// }
|
||||
|
||||
}
|
|
@ -1,114 +0,0 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
/**
|
||||
* For the rest of the fields delegate the compare measure to the second string library.
|
||||
*/
|
||||
public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo {
|
||||
|
||||
// val aliases = Map(('â‚' to '₉') zip ('1' to '9'): _*) ++ Map(('â´' to 'â¹') zip ('4' to '9'): _*) ++ Map('¹' -> '1', '²' ->
|
||||
// '2', * '³'
|
||||
// -> '3')
|
||||
|
||||
/** The ssalgo. */
|
||||
protected AbstractStringDistance ssalgo;
|
||||
|
||||
/** The weight. */
|
||||
protected double weight = 0.0;
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
protected SecondStringDistanceAlgo(Map<String, Number> params, final AbstractStringDistance ssalgo){
|
||||
this.params = params;
|
||||
this.weight = params.get("weight").doubleValue();
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) {
|
||||
this.ssalgo = ssalgo;
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
protected SecondStringDistanceAlgo(final AbstractStringDistance ssalgo){
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize.
|
||||
*
|
||||
* @param d
|
||||
* the d
|
||||
* @return the double
|
||||
*/
|
||||
protected abstract double normalize(double d);
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
public double distance(final String a, final String b) {
|
||||
double score = ssalgo.score(a, b);
|
||||
return normalize(score);
|
||||
}
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
protected double distance(final List<String> a, final List<String> b) {
|
||||
return distance(concat(a), concat(b));
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final Field a, final Field b) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue());
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b));
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* To list.
|
||||
*
|
||||
* @param list
|
||||
* the list
|
||||
* @return the list
|
||||
*/
|
||||
protected List<String> toList(final Field list) {
|
||||
return ((FieldList) list).stringList();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,39 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("AlwaysMatch")
|
||||
public class AlwaysMatch extends SecondStringDistanceAlgo {
|
||||
|
||||
public AlwaysMatch(final Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public AlwaysMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,39 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("ExactMatch")
|
||||
public class ExactMatch extends SecondStringDistanceAlgo {
|
||||
|
||||
public ExactMatch(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public ExactMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,44 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@DistanceClass("JaroWinkler")
|
||||
public class JaroWinkler extends SecondStringDistanceAlgo {
|
||||
|
||||
public JaroWinkler(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public JaroWinkler(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,76 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@DistanceClass("JaroWinklerNormalizedName")
|
||||
public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
public JaroWinklerNormalizedName(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public JaroWinklerNormalizedName(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
if (sameCity(cities1,cities2)) {
|
||||
|
||||
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||
|
||||
ca = removeKeywords(ca, keywords1);
|
||||
ca = removeKeywords(ca, cities1);
|
||||
cb = removeKeywords(cb, keywords2);
|
||||
cb = removeKeywords(cb, cities2);
|
||||
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca,cb));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,44 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@DistanceClass("JaroWinklerTitle")
|
||||
public class JaroWinklerTitle extends SecondStringDistanceAlgo {
|
||||
|
||||
public JaroWinklerTitle(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public JaroWinklerTitle(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
boolean check = checkNumbers(ca, cb);
|
||||
return check ? 0.5 : normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,34 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("Level2JaroWinkler")
|
||||
public class Level2JaroWinkler extends SecondStringDistanceAlgo {
|
||||
|
||||
public Level2JaroWinkler(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public Level2JaroWinkler(double w) {
|
||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,49 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("Level2JaroWinklerTitle")
|
||||
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
|
||||
|
||||
public Level2JaroWinklerTitle(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public Level2JaroWinklerTitle(final double w) {
|
||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return ssalgo.score(cca, ccb);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,34 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("Level2Levenstein")
|
||||
public class Level2Levenstein extends SecondStringDistanceAlgo {
|
||||
|
||||
public Level2Levenstein(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Level2Levenstein());
|
||||
}
|
||||
|
||||
public Level2Levenstein(double w) {
|
||||
super(w, new com.wcohen.ss.Level2Levenstein());
|
||||
}
|
||||
|
||||
protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,34 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("Levenstein")
|
||||
public class Levenstein extends SecondStringDistanceAlgo {
|
||||
|
||||
public Levenstein(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public Levenstein(double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected Levenstein(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,57 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("LevensteinTitle")
|
||||
public class LevensteinTitle extends SecondStringDistanceAlgo {
|
||||
|
||||
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
||||
|
||||
public LevensteinTitle(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public LevensteinTitle(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
return 1 - (Math.abs(score) / Math.max(la, lb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,58 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Compared compare between two titles, ignoring version numbers. Suitable for Software entities.
|
||||
*/
|
||||
@DistanceClass("LevensteinTitleIgnoreVersion")
|
||||
public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo {
|
||||
|
||||
public LevensteinTitleIgnoreVersion(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public LevensteinTitleIgnoreVersion(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim();
|
||||
cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim();
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
return 1 - (Math.abs(score) / Math.max(la, lb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,39 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("MustBeDifferent")
|
||||
public class MustBeDifferent extends SecondStringDistanceAlgo {
|
||||
|
||||
public MustBeDifferent(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public MustBeDifferent(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
return !a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,29 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Not all fields of a document need to partecipate in the compare measure. We model those fields as having a
|
||||
* NullDistanceAlgo.
|
||||
*/
|
||||
@DistanceClass("Null")
|
||||
public class NullDistanceAlgo implements DistanceAlgo {
|
||||
|
||||
public NullDistanceAlgo(Map<String, Number> params){
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(Field a, Field b) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,60 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@DistanceClass("SortedJaroWinkler")
|
||||
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
||||
|
||||
public SortedJaroWinkler(Map<String,Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
*/
|
||||
public SortedJaroWinkler(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,60 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@DistanceClass("SortedLevel2JaroWinkler")
|
||||
public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
*/
|
||||
public SortedLevel2JaroWinkler(final double weight) {
|
||||
super(weight, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public SortedLevel2JaroWinkler(final Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,48 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
/**
|
||||
* For the rest of the fields delegate the compare measure to the second string library.
|
||||
*/
|
||||
public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SortedSecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
protected SortedSecondStringDistanceAlgo(final Map<String, Number> params, final AbstractStringDistance ssalgo){
|
||||
super(params.get("weight").doubleValue(), ssalgo);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
protected List<String> toList(final Field list) {
|
||||
FieldList fl = (FieldList) list;
|
||||
List<String> values = Lists.newArrayList(fl.stringList());
|
||||
Collections.sort(values);
|
||||
return values;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,99 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SubStringLevenstein.
|
||||
*/
|
||||
@DistanceClass("SubStringLevenstein")
|
||||
public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
||||
|
||||
/** The limit. */
|
||||
protected int limit;
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
*/
|
||||
public SubStringLevenstein(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public SubStringLevenstein(Map<String, Number> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
this.limit = params.get("limit").intValue();
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
* @param limit
|
||||
* the limit
|
||||
*/
|
||||
public SubStringLevenstein(final double w, final int limit) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
* @param limit
|
||||
* the limit
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final Field a, final Field b) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,59 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("urlMatcher")
|
||||
public class UrlMatcher extends Levenstein {
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
public UrlMatcher(Map<String, Number> params){
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public UrlMatcher(double weight, Map<String, Number> params) {
|
||||
super(weight);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public void setParams(Map<String, Number> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(Field a, Field b) {
|
||||
|
||||
final URL urlA = asUrl(getFirstValue(a));
|
||||
final URL urlB = asUrl(getFirstValue(b));
|
||||
|
||||
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Double hostW = params.get("host").doubleValue();
|
||||
Double pathW = params.get("path").doubleValue();
|
||||
|
||||
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
|
||||
return hostW * 0.5;
|
||||
}
|
||||
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) {
|
||||
try {
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,56 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.eval;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
/**
|
||||
* Created by claudio on 09/03/16.
|
||||
*/
|
||||
public class ConditionEval {
|
||||
|
||||
private String cond;
|
||||
|
||||
private Field a;
|
||||
|
||||
private Field b;
|
||||
|
||||
private int result;
|
||||
|
||||
public ConditionEval(final String cond, final Field a, final Field b, final int result) {
|
||||
this.cond = cond;
|
||||
this.a = a;
|
||||
this.b = b;
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
public Field getA() {
|
||||
return a;
|
||||
}
|
||||
|
||||
public void setA(final Field a) {
|
||||
this.a = a;
|
||||
}
|
||||
|
||||
public Field getB() {
|
||||
return b;
|
||||
}
|
||||
|
||||
public void setB(final Field b) {
|
||||
this.b = b;
|
||||
}
|
||||
|
||||
public int getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public void setResult(final int result) {
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
public String getCond() {
|
||||
return cond;
|
||||
}
|
||||
|
||||
public void setCond(final String cond) {
|
||||
this.cond = cond;
|
||||
}
|
||||
}
|
|
@ -1,38 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.eval;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
import com.google.common.base.Predicate;
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
/**
|
||||
* Created by claudio on 09/03/16.
|
||||
*/
|
||||
public class ConditionEvalMap extends HashMap<String, ConditionEval> {
|
||||
|
||||
|
||||
public ConditionEvalMap mergeFrom(ConditionEvalMap map) {
|
||||
putAll(map);
|
||||
return this;
|
||||
}
|
||||
|
||||
public boolean anyNegative() {
|
||||
return values().stream()
|
||||
.allMatch(ec -> ec.getResult() < 0);
|
||||
}
|
||||
|
||||
public boolean isZero() {
|
||||
return result() == 0;
|
||||
}
|
||||
|
||||
public int result() {
|
||||
int res = 0;
|
||||
for(ConditionEval ec : values()) {
|
||||
final int verify = ec.getResult();
|
||||
if (verify < 0) return -1;
|
||||
res += verify;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,56 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.eval;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Created by claudio on 09/03/16.
|
||||
*/
|
||||
public class DistanceEval {
|
||||
|
||||
private FieldDef fieldDef;
|
||||
|
||||
private Field a;
|
||||
|
||||
private Field b;
|
||||
|
||||
private double distance = 0.0;
|
||||
|
||||
public DistanceEval(final FieldDef fieldDef, final Field a, final Field b) {
|
||||
this.fieldDef = fieldDef;
|
||||
this.a = a;
|
||||
this.b = b;
|
||||
}
|
||||
|
||||
public Field getA() {
|
||||
return a;
|
||||
}
|
||||
|
||||
public void setA(final Field a) {
|
||||
this.a = a;
|
||||
}
|
||||
|
||||
public Field getB() {
|
||||
return b;
|
||||
}
|
||||
|
||||
public void setB(final Field b) {
|
||||
this.b = b;
|
||||
}
|
||||
|
||||
public FieldDef getFieldDef() {
|
||||
return fieldDef;
|
||||
}
|
||||
|
||||
public void setFieldDef(final FieldDef fieldDef) {
|
||||
this.fieldDef = fieldDef;
|
||||
}
|
||||
|
||||
public double getDistance() {
|
||||
return distance;
|
||||
}
|
||||
|
||||
public void setDistance(final double distance) {
|
||||
this.distance = distance;
|
||||
}
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.condition.*;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import eu.dnetlib.pace.util.PaceResolver;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
public class CondDef implements Serializable {
|
||||
|
||||
private String name;
|
||||
|
||||
private List<String> fields;
|
||||
|
||||
public CondDef() {}
|
||||
|
||||
public ConditionAlgo conditionAlgo(final List<FieldDef> fields) {
|
||||
return PaceConfig.resolver.getConditionAlgo(getName(), fields);
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public List<String> getFields() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
public void setFields(final List<String> fields) {
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("unable to serialise " + this.getClass().getName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -3,15 +3,10 @@ package eu.dnetlib.pace.model;
|
|||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.util.PaceResolver;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
|
||||
|
@ -26,16 +21,6 @@ public class FieldDef implements Serializable {
|
|||
|
||||
private Type type;
|
||||
|
||||
private boolean ignoreMissing;
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
private boolean overrideMatch;
|
||||
|
||||
/**
|
||||
|
@ -48,8 +33,6 @@ public class FieldDef implements Serializable {
|
|||
*/
|
||||
private int length = -1;
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
public FieldDef() {}
|
||||
|
||||
// def apply(s: String): Field[A]
|
||||
|
@ -110,14 +93,6 @@ public class FieldDef implements Serializable {
|
|||
this.length = length;
|
||||
}
|
||||
|
||||
public Map<String, Number> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
public void setParams(final Map<String, Number> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
@ -24,7 +23,7 @@ public class AlwaysMatch extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double compare(final Field a, final Field b) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
/**
|
||||
* The Class ExactMatch.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ComparatorClass("doiExactMatch")
|
||||
public class DoiExactMatch extends ExactMatchIgnoreCase {
|
||||
|
||||
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
public DoiExactMatch(final Map<String, Number> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getValue(final Field f) {
|
||||
return super.getValue(f).replaceAll(PREFIX, "");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("domainExactMatch")
|
||||
public class DomainExactMatch extends ExactMatchIgnoreCase {
|
||||
|
||||
public DomainExactMatch(final Map<String, Number> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getValue(final Field f) {
|
||||
try {
|
||||
return asUrl(super.getValue(f)).getHost();
|
||||
} catch (MalformedURLException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) throws MalformedURLException {
|
||||
return new URL(value);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("exactMatchIgnoreCase")
|
||||
public class ExactMatchIgnoreCase extends AbstractComparator {
|
||||
|
||||
public ExactMatchIgnoreCase(Map<String, Number> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b) {
|
||||
|
||||
final String fa = getValue(a);
|
||||
final String fb = getValue(b);
|
||||
|
||||
if (fa.isEmpty() || fb.isEmpty())
|
||||
return -1;
|
||||
|
||||
return fa.equalsIgnoreCase(fb) ? 1 : 0;
|
||||
}
|
||||
|
||||
protected String getValue(final Field f) {
|
||||
return getFirstValue(f);
|
||||
}
|
||||
}
|
|
@ -1,12 +1,9 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
|
|
|
@ -1,9 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.logging.Log;
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.adaptor.Pid;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ComparatorClass("pidMatch")
|
||||
public class PidMatch extends AbstractComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(PidMatch.class);
|
||||
private Map<String, Number> params;
|
||||
|
||||
public PidMatch(final Map<String, Number> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
|
||||
final List<String> sa = ((FieldList) a).stringList();
|
||||
final List<String> sb = ((FieldList) b).stringList();
|
||||
|
||||
final List<Pid> pal = Pid.fromOafJson(sa);
|
||||
final List<Pid> pbl = Pid.fromOafJson(sb);
|
||||
|
||||
if (pal.isEmpty() || pbl.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> pidAset = toHashSet(pal);
|
||||
final Set<String> pidBset = toHashSet(pbl);
|
||||
|
||||
int incommon = Sets.intersection(pidAset, pidBset).size();
|
||||
int simDiff = Sets.symmetricDifference(pidAset, pidBset).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
return (double)incommon / (incommon + simDiff) > params.getOrDefault("threshold", 0.5).doubleValue() ? 1 : 0;
|
||||
|
||||
}
|
||||
|
||||
//lowercase + normalization of the pid before adding it to the set
|
||||
private Set<String> toHashSet(List<Pid> pbl) {
|
||||
|
||||
return pbl.stream()
|
||||
.map(pid -> pid.getType() + normalizePid(pid.getValue()))
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
/**
|
||||
* Returns true if the number of values in the fields is the same.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ComparatorClass("sizeMatch")
|
||||
public class SizeMatch extends AbstractComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new size match.
|
||||
*
|
||||
* @param params
|
||||
* the parameters
|
||||
*/
|
||||
public SizeMatch(final Map<String, Number> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
|
||||
return Iterables.size(a) == Iterables.size(b) ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if is empty.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @return true, if is empty
|
||||
*/
|
||||
protected boolean isEmpty(final Iterable<?> a) {
|
||||
return (a == null) || Iterables.isEmpty(a);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
|
|
@ -2,8 +2,6 @@ package eu.dnetlib.pace.tree;
|
|||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
/**
|
||||
* Returns true if the titles in the given documents contains the same numbers, false otherwise.
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
*/
|
||||
@ComparatorClass("titleVersionMatch")
|
||||
public class TitleVersionMatch extends AbstractComparator {
|
||||
|
||||
public TitleVersionMatch(final Map<String, Number> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
final String valueA = getFirstValue(a);
|
||||
final String valueB = getFirstValue(b);
|
||||
|
||||
if (valueA.isEmpty() || valueB.isEmpty())
|
||||
return -1;
|
||||
|
||||
return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Returns true if the year of the date field in the given documents are the same, false when any of the two is invalid or it's missing.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ComparatorClass("yearMatch")
|
||||
public class YearMatch extends AbstractComparator {
|
||||
|
||||
private int limit = 4;
|
||||
|
||||
public YearMatch(final Map<String, Number> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
final String valueA = getNumbers(getFirstValue(a));
|
||||
final String valueB = getNumbers(getFirstValue(b));
|
||||
|
||||
if (valueA.isEmpty() || valueB.isEmpty())
|
||||
return -1;
|
||||
|
||||
final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);
|
||||
final boolean onemissing = valueA.isEmpty() || valueB.isEmpty();
|
||||
|
||||
return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0;
|
||||
}
|
||||
|
||||
protected boolean checkLength(final String s) {
|
||||
return s.length() == limit;
|
||||
}
|
||||
|
||||
protected String getFirstValue(final Field value) {
|
||||
return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
}
|
|
@ -19,6 +19,10 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
|
|||
|
||||
private Map<String, Number> params;
|
||||
|
||||
/**
 * Creates a comparator that only keeps its configuration parameters.
 *
 * @param params the comparator parameters (stored by reference)
 */
protected AbstractComparator(Map<String, Number> params) {
    this.params = params;
}
|
||||
|
||||
protected AbstractComparator(Map<String, Number> params, final AbstractStringDistance ssalgo){
|
||||
this.params = params;
|
||||
this.weight = 1.0;
|
||||
|
@ -49,7 +53,9 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
|
|||
* the d
|
||||
* @return the double
|
||||
*/
|
||||
protected abstract double normalize(double d);
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
|
|
|
@ -4,6 +4,10 @@ import eu.dnetlib.pace.model.Field;
|
|||
|
||||
public interface Comparator {
|
||||
|
||||
/*
|
||||
* return : -1 -> can't decide (missing field)
|
||||
* >0 -> similarity degree (depends on the algorithm)
|
||||
* */
|
||||
public double compare(Field a, Field b);
|
||||
|
||||
}
|
||||
|
|
|
@ -3,7 +3,6 @@ package eu.dnetlib.pace.tree.support;
|
|||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -36,12 +35,10 @@ public class TreeNodeDef implements Serializable {
|
|||
public TreeNodeDef() {
|
||||
}
|
||||
|
||||
public double evaluate(MapDocument doc1, MapDocument doc2) {
|
||||
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2) {
|
||||
|
||||
DescriptiveStatistics stats = new DescriptiveStatistics();
|
||||
double sumWeights = 0.0; //for the weighted mean
|
||||
|
||||
int missCount = 0; //counter for the number of misses
|
||||
TreeNodeStats stats = new TreeNodeStats();
|
||||
stats.setFieldsCount(fields.size());
|
||||
|
||||
for (FieldConf fieldConf : fields) {
|
||||
|
||||
|
@ -49,40 +46,20 @@ public class TreeNodeDef implements Serializable {
|
|||
|
||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
|
||||
|
||||
if (result >= 0.0) { //if the field is not missing
|
||||
stats.addValue(weight * result);
|
||||
sumWeights += weight; //sum weights, to be used in case of weighted mean
|
||||
}
|
||||
else { //if the field is missing
|
||||
missCount += 1;
|
||||
if (!fieldConf.isIgnoreMissing()){ //if the miss has not to be ignored
|
||||
stats.addValue(weight * 0);
|
||||
sumWeights += weight;
|
||||
if (result == -1) { //if the field is missing
|
||||
stats.incrementMissCount();
|
||||
if (!fieldConf.isIgnoreMissing()) {
|
||||
stats.incrementWeightsSum(weight);
|
||||
}
|
||||
}
|
||||
else { //if the field is not missing
|
||||
stats.incrementScoresSum(weight * result);
|
||||
stats.incrementWeightsSum(weight);
|
||||
}
|
||||
|
||||
//global ignoremissing (if one of the field is missing, return undefined)
|
||||
if (!ignoreMissing && missCount>0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
switch (aggregation){
|
||||
|
||||
case AVG:
|
||||
return stats.getMean();
|
||||
case SUM:
|
||||
return stats.getSum();
|
||||
case MAX:
|
||||
return stats.getMax();
|
||||
case MIN:
|
||||
return stats.getMin();
|
||||
case WEIGHTED_MEAN:
|
||||
return stats.getSum()/sumWeights;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
return stats;
|
||||
}
|
||||
|
||||
private Comparator comparator(final FieldConf field){
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class TreeNodeStats implements Serializable {
|
||||
|
||||
private DescriptiveStatistics stats;
|
||||
private int missCount = 0;
|
||||
private int fieldsCount = 0;
|
||||
private double weightsSum = 0.0;
|
||||
|
||||
public TreeNodeStats(){
|
||||
this.stats = new DescriptiveStatistics();
|
||||
}
|
||||
|
||||
public TreeNodeStats(int missCount, int fieldsCount, double weightsSum) {
|
||||
this.missCount = missCount;
|
||||
this.fieldsCount = fieldsCount;
|
||||
this.weightsSum = weightsSum;
|
||||
}
|
||||
|
||||
public DescriptiveStatistics getStats() {
|
||||
return stats;
|
||||
}
|
||||
|
||||
public void setStats(DescriptiveStatistics stats) {
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
public int getMissCount() {
|
||||
return missCount;
|
||||
}
|
||||
|
||||
public void setMissCount(int missCount) {
|
||||
this.missCount = missCount;
|
||||
}
|
||||
|
||||
public int getFieldsCount() {
|
||||
return fieldsCount;
|
||||
}
|
||||
|
||||
public void setFieldsCount(int fields) {
|
||||
this.fieldsCount = fields;
|
||||
}
|
||||
|
||||
public double getWeightsSum() {
|
||||
return weightsSum;
|
||||
}
|
||||
|
||||
public void setWeightsSum(double weightsSum) {
|
||||
this.weightsSum = weightsSum;
|
||||
}
|
||||
|
||||
public void incrementWeightsSum(double delta){
|
||||
this.weightsSum += delta;
|
||||
}
|
||||
|
||||
public void incrementMissCount(){
|
||||
this.missCount += 1;
|
||||
}
|
||||
|
||||
public void incrementScoresSum(double delta){
|
||||
this.stats.addValue(delta);
|
||||
}
|
||||
|
||||
public double getFinalScore(AggType aggregation){
|
||||
|
||||
switch (aggregation){
|
||||
case AVG:
|
||||
return stats.getMean();
|
||||
case SUM:
|
||||
return stats.getSum();
|
||||
case MAX:
|
||||
return stats.getMax();
|
||||
case MIN:
|
||||
return stats.getMin();
|
||||
case WEIGHTED_MEAN:
|
||||
return stats.getSum()/weightsSum;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The compare between two documents is given by the weighted mean of the field distances
|
||||
*/
|
||||
public class TreeProcessor {
|
||||
|
||||
private static final Log log = LogFactory.getLog(TreeProcessor.class);
|
||||
|
||||
private Config config;
|
||||
|
||||
public TreeProcessor(final Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public boolean compare(final MapDocument a, final MapDocument b) {
|
||||
|
||||
//evaluate the decision tree
|
||||
return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH;
|
||||
}
|
||||
|
||||
public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map<String, TreeNodeDef> decisionTree){
|
||||
|
||||
String current = "start";
|
||||
|
||||
while (MatchType.parse(current)==MatchType.UNDEFINED) {
|
||||
|
||||
TreeNodeDef currentNode = decisionTree.get(current);
|
||||
//throw an exception if the node doesn't exist
|
||||
if (currentNode == null)
|
||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
||||
|
||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2);
|
||||
|
||||
if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
|
||||
current = currentNode.getUndefined();
|
||||
}
|
||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||
current = currentNode.getPositive();
|
||||
}
|
||||
else {
|
||||
current = currentNode.getNegative();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return MatchType.parse(current);
|
||||
}
|
||||
|
||||
}
|
|
@ -5,7 +5,7 @@ import eu.dnetlib.pace.clustering.NGramUtils;
|
|||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.WfConfig;
|
||||
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
|
||||
import eu.dnetlib.pace.distance.PairwiseComparison;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.MapDocumentComparator;
|
||||
|
@ -150,14 +150,10 @@ public class BlockProcessor {
|
|||
|
||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||
|
||||
final PairwiseComparison pairwiseComparison = new PairwiseComparison(dedupConf);
|
||||
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
|
||||
|
||||
emitOutput(pairwiseComparison.compare(pivot, curr), idPivot, idCurr, context);
|
||||
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
|
||||
|
||||
// final ScoreResult sr = similarity(algo, pivot, curr);
|
||||
//// log.info(sr.toString()+"SCORE "+ sr.getScore());
|
||||
// emitOutput(sr, idPivot, idCurr, context);
|
||||
// i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,31 +2,21 @@ package eu.dnetlib.pace.util;
|
|||
|
||||
import eu.dnetlib.pace.clustering.ClusteringClass;
|
||||
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||
import eu.dnetlib.pace.condition.ConditionClass;
|
||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.reflections.Reflections;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class PaceResolver implements Serializable {
|
||||
|
||||
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
|
||||
public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition");
|
||||
public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.compare.algo");
|
||||
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
|
||||
|
||||
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
|
||||
private final Map<String, Class<ConditionAlgo>> conditionAlgos;
|
||||
private final Map<String, Class<DistanceAlgo>> distanceAlgos;
|
||||
private final Map<String, Class<Comparator>> comparators;
|
||||
|
||||
public PaceResolver() {
|
||||
|
@ -35,14 +25,6 @@ public class PaceResolver implements Serializable {
|
|||
.filter(ClusteringFunction.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
|
||||
|
||||
this.conditionAlgos = CONDITION_RESOLVER.getTypesAnnotatedWith(ConditionClass.class).stream()
|
||||
.filter(ConditionAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class<ConditionAlgo>)cl));
|
||||
|
||||
this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream()
|
||||
.filter(DistanceAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
||||
|
||||
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
|
||||
.filter(Comparator.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl));
|
||||
|
@ -56,22 +38,6 @@ public class PaceResolver implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
public DistanceAlgo getDistanceAlgo(String name, Map<String, Number> params) throws PaceException {
|
||||
try {
|
||||
return distanceAlgos.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
|
||||
public ConditionAlgo getConditionAlgo(String name, List<FieldDef> fields) throws PaceException {
|
||||
try {
|
||||
return conditionAlgos.get(name).getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
|
||||
public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
|
||||
try {
|
||||
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
package eu.dnetlib.pace.comparators;
|
||||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
|
||||
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
|
||||
public class ConditionTest extends AbstractPaceTest {
|
||||
|
||||
}
|
|
@ -12,16 +12,16 @@
|
|||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
|
||||
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
|
||||
],
|
||||
"sufficientConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
|
||||
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
|
||||
],
|
||||
"necessaryConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] },
|
||||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
|
||||
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
|
||||
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
|
||||
|
|
Loading…
Reference in New Issue