From f1c68d8ba3650cad783c05e3163102fefc6afb35 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 20 Nov 2018 10:51:38 +0100 Subject: [PATCH] apply limits (length, size) to pace Fields --- .../dnetlib/pace/distance/DistanceScorer.java | 41 +++++++++++++++---- .../pace/distance/algo/LevensteinTitle.java | 5 +++ .../eu/dnetlib/pace/util/PaceException.java | 6 ++- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java index 115fd1a1e..bb3c37ed6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java @@ -1,23 +1,28 @@ package eu.dnetlib.pace.distance; -import java.util.Collection; -import java.util.List; - import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.distance.eval.DistanceEval; import eu.dnetlib.pace.distance.eval.DistanceEvalMap; import eu.dnetlib.pace.distance.eval.ScoreResult; -import eu.dnetlib.pace.model.Document; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.model.*; +import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; /** * The distance between two documents is given by the weighted mean of the field distances */ public class DistanceScorer { + private static final Log log = LogFactory.getLog(DistanceScorer.class); + private Config config; public DistanceScorer(final Config config) { @@ -79,7 +84,7 @@ public class DistanceScorer { if (va.getType().equals(vb.getType())) { de.setDistance(w * fd.distanceAlgo().distance(va, vb)); } else { - throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", va, va.getType(), vb, vb.getType())); + throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType())); } } return de; @@ -87,7 +92,27 @@ public class DistanceScorer { } private Field getValue(final Document d, final FieldDef fd) { - return d.values(fd.getName()); + final Field v = d.values(fd.getName()); + if (fd.getLength() > 0) { + + if (v instanceof FieldValueImpl) { + ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength())); + } else if (v instanceof FieldListImpl) { + List strings = ((FieldListImpl) v).stringList(); + strings = strings.stream() + .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) + .map(s -> StringUtils.substring(s, 0, fd.getLength())) + .collect(Collectors.toList()); + ((FieldListImpl) v).clear(); + ((FieldListImpl) v).addAll(strings.stream() + .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) + .map(s -> StringUtils.substring(s, 0, fd.getLength())) + .map(s -> new FieldValueImpl(v.getType(), v.getName(), s)) + .collect(Collectors.toList())); + } + } + + return v; } private double sumWeights(final Collection fields) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java index 10de8597c..503dc33b2 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java @@ -2,13 +2,18 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.distance.DistanceClass; +import eu.dnetlib.pace.distance.DistanceScorer; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import java.util.Map; @DistanceClass("LevensteinTitle") public class LevensteinTitle extends SecondStringDistanceAlgo { + private static final Log log = LogFactory.getLog(LevensteinTitle.class); + public LevensteinTitle(Map params){ super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java index 34fd8ba20..19c546f03 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java @@ -2,8 +2,12 @@ package eu.dnetlib.pace.util; public class PaceException extends RuntimeException { - public PaceException(String s, Throwable e){ + public PaceException(String s, Throwable e) { super(s, e); } + public PaceException(String s) { + super(s); + } + }