From bc4505e0e647cb7beec9ec0782d91699f9c81e04 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Sat, 20 Oct 2018 08:38:19 +0200 Subject: [PATCH] revised PidMatch implementation, cleanup --- .../dnetlib/pace/AbstractProtoPaceTest.java | 12 ++++- .../dnetlib/pace/distance/DetectorTest.java | 49 ++++++++++++++++++- .../eu/dnetlib/pace/result.pace.conf | 2 +- .../eu/dnetlib/pace/condition/PidMatch.java | 25 ++++++---- .../pace/distance/algo/LevensteinDate.java | 25 ---------- .../pace/distance/algo/YearLevenstein.java | 37 -------------- .../java/eu/dnetlib/pace/model/FieldDef.java | 2 - 7 files changed, 74 insertions(+), 78 deletions(-) delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java index 961fdd6..838836b 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import com.google.gson.Gson; import eu.dnetlib.data.proto.FieldTypeProtos.Author; import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; @@ -21,12 +22,14 @@ import eu.dnetlib.pace.model.gt.GTAuthor; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.RandomStringUtils; import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.RandomUtils; import java.io.IOException; import java.io.StringWriter; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -101,6 +104,13 @@ public abstract class AbstractProtoPaceTest extends OafTest 
{ return result(config, id, title, date, Lists.newArrayList(pid), authors); } + static List pidTypes = Lists.newArrayList(); + static { + pidTypes.add("doi"); + //pidTypes.add("oai"); + //pidTypes.add("pmid"); + } + protected MapDocument result(final Config config, final String id, final String title, final String date, final List pid, final List authors) { final Result.Metadata.Builder metadata = Result.Metadata.newBuilder(); if (!StringUtils.isBlank(title)) { @@ -126,7 +136,7 @@ public abstract class AbstractProtoPaceTest extends OafTest { if (pid != null) { for(String p : pid) { if (!StringUtils.isBlank(p)) { - entity.addPid(sp(p, "doi")); + /* the upper bound of RandomUtils.nextInt is exclusive, so pass size() (not size() - 1) to keep every configured pid type selectable */ entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size())))); //entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai")); } } diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java index e2d3ad7..7a265e5 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java @@ -108,8 +108,8 @@ public class DetectorTest extends AbstractProtoPaceTest { public void testDistanceResultMissingTwoDate() { final Config config = getResultConf(); - final MapDocument resA = result(config, "A", "title title title 6BESR"); - final MapDocument resB = result(config, "B", "title title title 6CLER"); + final MapDocument resA = result(config, "A", "bellaciao"); + final MapDocument resB = result(config, "B", "bellocioa"); final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); double d = sr.getScore(); @@ -326,6 +326,51 @@ public class DetectorTest extends AbstractProtoPaceTest { // assertTrue(d.getScore() == 0.0); } + @Test + public void testDistanceResultNoPidsConf() { + + final Config config = getResultFullConf(); + + final MapDocument resA = + result(config, "A", "Presentations of perforated 
colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010"); + + final MapDocument resB = + result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + final double s = sr.getScore(); + + log.info(sr.toString()); + log.info(String.format(" s ---> %s", s)); + // assertTrue(d.getScore() == 0.0); + } + + @Test + public void testDistanceResultPidsConf() { + + final Config config = getResultFullConf(); + + final List authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva"); + final List authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie"); + + final List pidA = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b"); + final MapDocument resA = + result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", + pidA, authorsA); + + final List pidB = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d"); + final MapDocument resB = + result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010", + pidB, authorsB); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + final double s = sr.getScore(); + log.info(sr.toString()); + log.info(String.format(" s ---> %s", s)); + + // assertTrue(d.getScore() == 0.0); + } + @Test public void testDistanceResultFullConf() { diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf index 86dd27f..d17e7a3 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf @@ -20,7 +20,7 @@ ], "model" : [ { "name" : "pid", "algo" 
: "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, + { "name" : "title", "algo" : "Level2Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ], "blacklists" : { } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java index 4f9e042..a20ab95 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java @@ -1,7 +1,11 @@ package eu.dnetlib.pace.condition; +import java.util.HashSet; import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import com.google.common.collect.Sets; import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; @@ -33,20 +37,21 @@ public class PidMatch extends AbstractCondition { final List pal = Pid.fromOafJson(sa); final List pbl = Pid.fromOafJson(sb); - int result = 0; - for(Pid pa : pal) { - final String ta = pa.getType(); + final Set pidAset = toHashSet(pal); + final Set pidBset = toHashSet(pbl); - for(Pid pb : pbl) { - final String tb = pb.getType(); + int incommon = Sets.intersection(pidAset, pidBset).size(); + int simDiff = Sets.symmetricDifference(pidAset, pidBset).size(); - if (tb.equalsIgnoreCase(ta)) { - result += pa.getValue().equalsIgnoreCase(pb.getValue()) ? 1 : -1; - } - } - } + /* share of common pids over all distinct pids, computed in floating point because integer division truncated the ratio to 0 or 1; when neither record carries a pid the ratio is undefined, so yield 0 as the removed loop-based code did instead of dividing by zero */ int result = (incommon + simDiff) == 0 ? 0 : (double) incommon / (incommon + simDiff) > 0.5 ? 
1 : -1; return new ConditionEval(cond, a, b, result); } + /** Keys each pid as a lower-cased "type::value" string so the set operations compare type and value case-insensitively, as the removed equalsIgnoreCase-based matching did; the separator keeps type and value from running together. */ private Set toHashSet(List pbl) { + return pbl.stream() + .map(pid -> (pid.getType() + "::" + pid.getValue()).toLowerCase(java.util.Locale.ROOT)) + .collect(Collectors.toCollection(HashSet::new)); + } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java deleted file mode 100644 index 5452955..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java +++ /dev/null @@ -1,25 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - - -public class LevensteinDate extends Levenstein { - - - public LevensteinDate(double w) { - super(w); - } - - - @Override - public double distance(String a, String b) { - - return 1.0; - } - - - - @Override - public double getWeight() { - return super.weight; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java deleted file mode 100644 index 4e9796c..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java +++ /dev/null @@ -1,37 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - - -public class YearLevenstein extends SubStringLevenstein { - - public YearLevenstein(double w) { - super(w); - } - - public YearLevenstein(double w, int limit) { - super(w, limit); - } - - @Override - public double distance(String a, String b) { - boolean check = checkLength(a) && checkLength(b); - if (check) { - if (a.equals(b)) { - return 1.0; - } else { - return 0.5; - } - } else { - return 1.0; - } - } - - protected boolean checkLength(String s) { - return getNumbers(s).length() == limit; - } - - @Override - public double getWeight() { - return super.weight; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index 8507b0c..5445053 
100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -83,8 +83,6 @@ public class FieldDef implements Serializable { return new LevensteinTitle(getWeight()); case SubStringLevenstein: return new SubStringLevenstein(getWeight(), getLimit()); - case YearLevenstein: - return new YearLevenstein(getWeight(), getLimit()); case SortedJaroWinkler: return new SortedJaroWinkler(getWeight()); case SortedLevel2JaroWinkler: