From 4d379c2227d8a1646ac461fd14bbfa6094f3d7b4 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Sat, 20 Oct 2018 08:38:19 +0200
Subject: [PATCH] revised PidMatch implementation, cleanup

---
 .../eu/dnetlib/pace/condition/PidMatch.java   | 40 +++++++++++++++-----
 .../pace/distance/algo/LevensteinDate.java    | 25 -------------
 .../pace/distance/algo/YearLevenstein.java    | 37 -------------------
 .../java/eu/dnetlib/pace/model/FieldDef.java  |  2 -
 4 files changed, 30 insertions(+), 74 deletions(-)
 delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java
 delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java

diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java
index 4f9e0423d..a20ab9528 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java
@@ -1,7 +1,11 @@
 package eu.dnetlib.pace.condition;
 
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
 
+import com.google.common.collect.Sets;
 import eu.dnetlib.pace.config.Cond;
 import eu.dnetlib.pace.distance.eval.ConditionEval;
 import eu.dnetlib.pace.model.Field;
@@ -33,20 +37,36 @@ public class PidMatch extends AbstractCondition {
 		final List<Pid> pal = Pid.fromOafJson(sa);
 		final List<Pid> pbl = Pid.fromOafJson(sb);
 
-		int result = 0;
-		for(Pid pa : pal) {
-			final String ta = pa.getType();
+		final Set<String> pidAset = toHashSet(pal);
+		final Set<String> pidBset = toHashSet(pbl);
 
-			for(Pid pb : pbl) {
-				final String tb = pb.getType();
+		int incommon = Sets.intersection(pidAset, pidBset).size();
+		int simDiff = Sets.symmetricDifference(pidAset, pidBset).size();
 
-				if (tb.equalsIgnoreCase(ta)) {
-					result += pa.getValue().equalsIgnoreCase(pb.getValue()) ? 1 : -1;
-				}
-			}
-		}
+		// intersection size + symmetric difference size = size of the union of the two pid sets
+		final int union = incommon + simDiff;
+		final int result;
+		if (union == 0) {
+			// no pid on either side: stay neutral (0) rather than dividing by zero
+			result = 0;
+		} else {
+			// floating point division: an int division would truncate every ratio below 1 to 0
+			result = (double) incommon / union > 0.5 ? 1 : -1;
+		}
 
 		return new ConditionEval(cond, a, b, result);
 	}
 
+	/**
+	 * Maps each pid to the concatenation of its type and value and collects the distinct keys.
+	 *
+	 * @param pbl the pids to convert
+	 * @return a mutable set holding one type+value key per distinct pid
+	 */
+	private Set<String> toHashSet(List<Pid> pbl) {
+		return pbl.stream()
+				.map(pid -> pid.getType() + pid.getValue())
+				.collect(Collectors.toCollection(HashSet::new));
+	}
+
 }
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java
deleted file mode 100644
index 545295567..000000000
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java
+++ /dev/null
@@ -1,25 +0,0 @@
-package eu.dnetlib.pace.distance.algo;
-
-
-public class LevensteinDate extends Levenstein {
-
-
-	public LevensteinDate(double w) {
-		super(w);
-	}
-
-
-	@Override
-	public double distance(String a, String b) {
-
-		return 1.0;
-	}
-
-
-
-	@Override
-	public double getWeight() {
-		return super.weight;
-	}
-
-}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java
deleted file mode 100644
index 4e9796c2d..000000000
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java
+++ /dev/null
@@ -1,37 +0,0 @@
-package eu.dnetlib.pace.distance.algo;
-
-
-public class YearLevenstein extends SubStringLevenstein {
-
-	public YearLevenstein(double w) {
-		super(w);
-	}
-
-	public YearLevenstein(double w, int limit) {
-		super(w, limit);
-	}
-
-	@Override
-	public double distance(String a, String b) {
-		boolean check = checkLength(a) && checkLength(b);
-		if (check) {
-			if (a.equals(b)) {
-				return 1.0;
-			} else {
-				return 0.5;
-			}
-		} else {
-			return 1.0;
-		}
-	}
-
-	protected boolean checkLength(String s) {
-		return getNumbers(s).length() == limit;
-	}
-
-	@Override
-	public double getWeight() {
-		return super.weight;
-	}
-
-}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
index 8507b0cd7..5445053bd 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@@ -83,8 +83,6 @@ public class FieldDef implements Serializable {
 			return new LevensteinTitle(getWeight());
 		case SubStringLevenstein:
 			return new SubStringLevenstein(getWeight(), getLimit());
-		case YearLevenstein:
-			return new YearLevenstein(getWeight(), getLimit());
 		case SortedJaroWinkler:
 			return new SortedJaroWinkler(getWeight());
 		case SortedLevel2JaroWinkler: