forked from D-Net/dnet-hadoop
apply limits (length, size) to pace Fields
This commit is contained in:
parent
c5979ffe18
commit
f1c68d8ba3
|
@ -1,23 +1,28 @@
|
||||||
package eu.dnetlib.pace.distance;
|
package eu.dnetlib.pace.distance;
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||||
import eu.dnetlib.pace.distance.eval.DistanceEval;
|
import eu.dnetlib.pace.distance.eval.DistanceEval;
|
||||||
import eu.dnetlib.pace.distance.eval.DistanceEvalMap;
|
import eu.dnetlib.pace.distance.eval.DistanceEvalMap;
|
||||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||||
import eu.dnetlib.pace.model.Document;
|
import eu.dnetlib.pace.model.*;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The distance between two documents is given by the weighted mean of the field distances
|
* The distance between two documents is given by the weighted mean of the field distances
|
||||||
*/
|
*/
|
||||||
public class DistanceScorer {
|
public class DistanceScorer {
|
||||||
|
|
||||||
|
private static final Log log = LogFactory.getLog(DistanceScorer.class);
|
||||||
|
|
||||||
private Config config;
|
private Config config;
|
||||||
|
|
||||||
public DistanceScorer(final Config config) {
|
public DistanceScorer(final Config config) {
|
||||||
|
@ -79,7 +84,7 @@ public class DistanceScorer {
|
||||||
if (va.getType().equals(vb.getType())) {
|
if (va.getType().equals(vb.getType())) {
|
||||||
de.setDistance(w * fd.distanceAlgo().distance(va, vb));
|
de.setDistance(w * fd.distanceAlgo().distance(va, vb));
|
||||||
} else {
|
} else {
|
||||||
throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return de;
|
return de;
|
||||||
|
@ -87,7 +92,27 @@ public class DistanceScorer {
|
||||||
}
|
}
|
||||||
|
|
||||||
private Field getValue(final Document d, final FieldDef fd) {
|
private Field getValue(final Document d, final FieldDef fd) {
|
||||||
return d.values(fd.getName());
|
final Field v = d.values(fd.getName());
|
||||||
|
if (fd.getLength() > 0) {
|
||||||
|
|
||||||
|
if (v instanceof FieldValueImpl) {
|
||||||
|
((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
|
||||||
|
} else if (v instanceof FieldListImpl) {
|
||||||
|
List<String> strings = ((FieldListImpl) v).stringList();
|
||||||
|
strings = strings.stream()
|
||||||
|
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||||
|
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
((FieldListImpl) v).clear();
|
||||||
|
((FieldListImpl) v).addAll(strings.stream()
|
||||||
|
.limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
|
||||||
|
.map(s -> StringUtils.substring(s, 0, fd.getLength()))
|
||||||
|
.map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
private double sumWeights(final Collection<FieldDef> fields) {
|
private double sumWeights(final Collection<FieldDef> fields) {
|
||||||
|
|
|
@ -2,13 +2,18 @@ package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceScorer;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@DistanceClass("LevensteinTitle")
|
@DistanceClass("LevensteinTitle")
|
||||||
public class LevensteinTitle extends SecondStringDistanceAlgo {
|
public class LevensteinTitle extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
||||||
|
|
||||||
public LevensteinTitle(Map<String,Number> params){
|
public LevensteinTitle(Map<String,Number> params){
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,8 +2,12 @@ package eu.dnetlib.pace.util;
|
||||||
|
|
||||||
/**
 * Unchecked exception raised by the pace library.
 */
public class PaceException extends RuntimeException {

    /**
     * Creates an exception carrying only a message.
     *
     * @param message description of the failure
     */
    public PaceException(final String message) {
        super(message);
    }

    /**
     * Creates an exception carrying a message and the underlying cause.
     *
     * @param message description of the failure
     * @param cause   the throwable that triggered this exception
     */
    public PaceException(final String message, final Throwable cause) {
        super(message, cause);
    }

}
|
||||||
|
|
Loading…
Reference in New Issue