2018-10-02 10:37:54 +02:00
|
|
|
package eu.dnetlib.pace.distance;
|
|
|
|
|
|
|
|
import eu.dnetlib.pace.condition.ConditionAlgo;
|
|
|
|
import eu.dnetlib.pace.config.Config;
|
|
|
|
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
|
|
|
import eu.dnetlib.pace.distance.eval.DistanceEval;
|
|
|
|
import eu.dnetlib.pace.distance.eval.DistanceEvalMap;
|
|
|
|
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
2018-11-20 10:51:38 +01:00
|
|
|
import eu.dnetlib.pace.model.*;
|
|
|
|
import eu.dnetlib.pace.util.PaceException;
|
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
|
|
import org.apache.commons.logging.Log;
|
|
|
|
import org.apache.commons.logging.LogFactory;
|
|
|
|
|
|
|
|
import java.util.Collection;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.stream.Collectors;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
/**
|
|
|
|
* The distance between two documents is given by the weighted mean of the field distances
|
|
|
|
*/
|
|
|
|
public class DistanceScorer {
|
|
|
|
|
2018-11-20 10:51:38 +01:00
|
|
|
private static final Log log = LogFactory.getLog(DistanceScorer.class);
|
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
private Config config;
|
|
|
|
|
|
|
|
public DistanceScorer(final Config config) {
|
|
|
|
this.config = config;
|
|
|
|
}
|
|
|
|
|
|
|
|
public ScoreResult distance(final Document a, final Document b) {
|
2018-10-24 12:09:41 +02:00
|
|
|
final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
sr.setStrictConditions(verify(a, b, config.strictConditions()));
|
|
|
|
sr.setConditions(verify(a, b, config.conditions()));
|
|
|
|
|
|
|
|
final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model()));
|
|
|
|
|
|
|
|
for (final FieldDef fd : config.model()) {
|
|
|
|
|
|
|
|
dMap.updateDistance(fieldDistance(a, b, fd));
|
|
|
|
}
|
|
|
|
sr.setDistances(dMap);
|
|
|
|
return sr;
|
|
|
|
}
|
|
|
|
|
|
|
|
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
|
|
|
|
final ConditionEvalMap res = new ConditionEvalMap();
|
|
|
|
|
|
|
|
for (final ConditionAlgo cd : conditions) {
|
|
|
|
final ConditionEvalMap map = cd.verify(a, b);
|
|
|
|
res.mergeFrom(map);
|
|
|
|
|
|
|
|
// commented out shortcuts
|
|
|
|
/*
|
|
|
|
if (map.anyNegative()) {
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
|
|
|
|
//if (strict && (res < 0)) return -1;
|
|
|
|
//cond += verify;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) {
|
|
|
|
|
|
|
|
final double w = fd.getWeight();
|
|
|
|
final Field va = getValue(a, fd);
|
|
|
|
final Field vb = getValue(b, fd);
|
|
|
|
|
|
|
|
final DistanceEval de = new DistanceEval(fd, va, vb);
|
|
|
|
if ((w == 0)) return de; // optimization for 0 weight
|
|
|
|
else {
|
|
|
|
if (va.isEmpty() || vb.isEmpty()) {
|
|
|
|
if (fd.isIgnoreMissing()) {
|
|
|
|
de.setDistance(-1);
|
|
|
|
} else {
|
|
|
|
de.setDistance(w);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (va.getType().equals(vb.getType())) {
|
2018-10-29 16:16:11 +01:00
|
|
|
de.setDistance(w * fd.distanceAlgo().distance(va, vb));
|
2018-10-02 10:37:54 +02:00
|
|
|
} else {
|
2018-11-20 10:51:38 +01:00
|
|
|
throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return de;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private Field getValue(final Document d, final FieldDef fd) {
|
2018-11-20 10:51:38 +01:00
|
|
|
final Field v = d.values(fd.getName());
|
|
|
|
if (fd.getLength() > 0) {
|
|
|
|
|
|
|
|
if (v instanceof FieldValueImpl) {
|
|
|
|
((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
|
|
|
|
} else if (v instanceof FieldListImpl) {
|
|
|
|
List<String> strings = ((FieldListImpl) v).stringList();
|
|
|
|
strings = strings.stream()
|
|
|
|
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
|
|
|
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
((FieldListImpl) v).clear();
|
|
|
|
((FieldListImpl) v).addAll(strings.stream()
|
|
|
|
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
|
|
|
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
|
|
|
.map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
|
|
|
|
.collect(Collectors.toList()));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return v;
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
private double sumWeights(final Collection<FieldDef> fields) {
|
|
|
|
double sum = 0.0;
|
|
|
|
for (final FieldDef fd : fields) {
|
|
|
|
sum += fd.getWeight();
|
|
|
|
}
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|