added distance function for software titles

This commit is contained in:
Claudio Atzori 2018-11-17 09:11:38 +01:00
parent 55a9b4f501
commit 0dfb2ea600
2 changed files with 64 additions and 0 deletions

View File

@ -0,0 +1,55 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * Computes the similarity between two titles while ignoring version numbers. Suitable for
 * Software entities.
 *
 * <p>Each title is passed through {@code cleanup}, then every digit is removed together with
 * whatever {@code getRomans} reports for that title, the result is passed through
 * {@code finalCleanup}, and the two remaining strings are scored with the wrapped string
 * distance. The raw score is scaled by the length of the longer string.
 */
@DistanceClass("LevensteinTitleIgnoreVersion")
public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo {

	/**
	 * Creates the algorithm from a parameter map, backed by a SecondString {@code Levenstein}.
	 *
	 * @param params configuration forwarded unchanged to the superclass
	 */
	public LevensteinTitleIgnoreVersion(Map<String,Number> params){
		super(params, new com.wcohen.ss.Levenstein());
	}

	/**
	 * Creates the algorithm with an explicit weight, backed by a SecondString {@code Levenstein}.
	 *
	 * @param w weight forwarded to the superclass
	 */
	public LevensteinTitleIgnoreVersion(final double w) {
		super(w, new com.wcohen.ss.Levenstein());
	}

	/**
	 * Creates the algorithm with an explicit weight and a caller-supplied string distance.
	 *
	 * @param w      weight forwarded to the superclass
	 * @param ssalgo string distance implementation forwarded to the superclass
	 */
	protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
		super(w, ssalgo);
	}

	/**
	 * Scores how close two titles are once version information has been stripped.
	 *
	 * @param a first title
	 * @param b second title
	 * @return {@code 1 - |score| / max(length)} computed on the cleaned titles; {@code 1.0} when
	 *         both cleaned titles are empty
	 */
	@Override
	public double distance(final String a, final String b) {
		final String ca = stripVersion(cleanup(a));
		final String cb = stripVersion(cleanup(b));

		final String cca = finalCleanup(ca);
		final String ccb = finalCleanup(cb);

		return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
	}

	/**
	 * Removes every digit and every match of the pattern returned by {@code getRomans(s)}, then
	 * trims the result.
	 *
	 * <p>NOTE(review): the value returned by {@code getRomans} is used as a regular expression;
	 * presumably it only ever contains roman-numeral letters - confirm against its definition.
	 */
	private String stripVersion(final String s) {
		return s.replaceAll("\\d", "").replaceAll(getRomans(s), "").trim();
	}

	/**
	 * Scales a raw score by the length of the longer input.
	 *
	 * @param score raw score from the string distance; only its absolute value is used
	 * @param la    length of the first compared string
	 * @param lb    length of the second compared string
	 * @return {@code 1 - |score| / max(la, lb)}, or {@code 1.0} when both lengths are zero
	 */
	private double normalize(final double score, final int la, final int lb) {
		final int longest = Math.max(la, lb);
		if (longest == 0) {
			// Both strings are empty, so nothing differs. Without this guard the division
			// below would be 0.0 / 0 and the method would return NaN.
			return 1.0;
		}
		return 1 - (Math.abs(score) / longest);
	}

	/**
	 * @return the weight held by the superclass
	 */
	@Override
	public double getWeight() {
		return super.weight;
	}

	/**
	 * Single-argument normalization required by the superclass; {@link #distance(String, String)}
	 * in this class does not call it.
	 *
	 * @param d raw value; only its absolute value is used
	 * @return {@code 1 / (|d| + 1)^0.1}
	 */
	@Override
	protected double normalize(final double d) {
		return 1 / Math.pow(Math.abs(d) + 1, 0.1);
	}
}

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
import eu.dnetlib.pace.distance.algo.LevensteinTitleIgnoreVersion;
import org.apache.commons.lang.StringUtils;
import org.junit.Before;
import org.junit.Test;
@ -52,4 +53,12 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
System.out.println(result);
}
/**
 * Compares two titles that differ only in their version suffixes and checks that the result is
 * a well-formed similarity value.
 */
@Test
public void testLevensteinTitleIgnoreVersion() {
	final LevensteinTitleIgnoreVersion algo = new LevensteinTitleIgnoreVersion(params);
	final double result = algo.distance("gCube data layer v1.0 XI", "gCube data layer v1.5 VIII");
	System.out.println(result);

	// The exact value depends on the cleanup helpers, so only the invariants of the
	// normalization (1 - |score| / max length) are asserted here.
	org.junit.Assert.assertFalse("distance must not be NaN", Double.isNaN(result));
	org.junit.Assert.assertTrue("distance must lie in [0, 1], got " + result, result >= 0.0 && result <= 1.0);
}
}