From 0dfb2ea600a4eae81b568b544ec21a47cb515461 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Sat, 17 Nov 2018 09:11:38 +0100
Subject: [PATCH] added distance function for software titles

---
Note: this patch was edited by hand after export; the blob ids on the
"index" lines below still describe the originally committed content.

 .../algo/LevensteinTitleIgnoreVersion.java    | 77 +++++++++++++++++++
 .../pace/distance/DistanceAlgoTest.java       | 11 +++
 2 files changed, 88 insertions(+)
 create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java

diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java
new file mode 100644
index 000000000..0d8dd609c
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java
@@ -0,0 +1,77 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+import java.util.Map;
+
+/**
+ * Compares two titles, ignoring version numbers. Suitable for Software entities.
+ */
+@DistanceClass("LevensteinTitleIgnoreVersion")
+public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo {
+
+	/** Builds the algorithm from a parameter map, scoring with com.wcohen.ss.Levenstein. */
+	public LevensteinTitleIgnoreVersion(Map params){
+		super(params, new com.wcohen.ss.Levenstein());
+	}
+
+	/** Builds the algorithm with weight {@code w}, scoring with com.wcohen.ss.Levenstein. */
+	public LevensteinTitleIgnoreVersion(final double w) {
+		super(w, new com.wcohen.ss.Levenstein());
+	}
+
+	/** Lets subclasses plug in a different string distance implementation. */
+	protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
+		super(w, ssalgo);
+	}
+
+	/**
+	 * Compares two titles after removing digits and whatever {@code getRomans} reports
+	 * (presumably roman numerals; the helper is defined outside this class).
+	 *
+	 * @param a the first title
+	 * @param b the second title
+	 * @return the algorithm score scaled by the length of the longer cleaned title
+	 */
+	@Override
+	public double distance(final String a, final String b) {
+		String ca = cleanup(a);
+		String cb = cleanup(b);
+
+		ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim();
+		cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim();
+
+		final String cca = finalCleanup(ca);
+		final String ccb = finalCleanup(cb);
+
+		return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
+	}
+
+	/**
+	 * Scales the absolute score by the length of the longer string and subtracts it from 1.
+	 * Two empty strings have nothing left to tell them apart, so they score 1 instead of
+	 * the NaN that dividing by a zero length would produce.
+	 */
+	private double normalize(final double score, final int la, final int lb) {
+		final int longest = Math.max(la, lb);
+		if (longest == 0) {
+			return 1.0;
+		}
+		return 1 - (Math.abs(score) / longest);
+	}
+
+	/** Returns the weight held by the superclass. */
+	@Override
+	public double getWeight() {
+		return super.weight;
+	}
+
+	/** Maps a raw distance {@code d} to {@code 1 / (|d| + 1)^0.1}. */
+	@Override
+	protected double normalize(final double d) {
+		return 1 / Math.pow(Math.abs(d) + 1, 0.1);
+	}
+
+}
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java
index a9068a893..4380a6f3e 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java
@@ -1,6 +1,7 @@
 package eu.dnetlib.pace.distance;
 
 import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
+import eu.dnetlib.pace.distance.algo.LevensteinTitleIgnoreVersion;
 import org.apache.commons.lang.StringUtils;
 import org.junit.Before;
 import org.junit.Test;
@@ -52,4 +53,14 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
 		System.out.println(result);
 	}
 
+	@Test
+	public void testLevensteinTitleIgnoreVersion() {
+		final LevensteinTitleIgnoreVersion algo = new LevensteinTitleIgnoreVersion(params);
+		double result = algo.distance("gCube data layer v1.0 XI", "gCube data layer v1.5 VIII");
+
+		System.out.println(result);
+		// the normalized score must be a real number in [0, 1], never NaN
+		org.junit.Assert.assertTrue(result >= 0.0 && result <= 1.0);
+	}
+
 }