forked from D-Net/dnet-hadoop
Implementation of RomansMatch and re-implementation of the getNumbers function. New terms in the translation map and update of the configuration.
This commit is contained in:
parent
f791730330
commit
49f9beb4a8
|
@ -17,6 +17,7 @@ import java.io.IOException;
|
|||
import java.io.StringWriter;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
|
@ -48,6 +49,8 @@ public abstract class AbstractPaceFunctions {
|
|||
//doi prefix for normalization
|
||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
// Matches an optionally negative integer or decimal number (e.g. "7", "-3", "2.5").
// Compiled once and shared by all instances: the expression is a constant.
private static final Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||
|
||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
||||
|
||||
protected String concat(final List<String> l) {
|
||||
|
@ -92,7 +95,18 @@ public abstract class AbstractPaceFunctions {
|
|||
}
|
||||
|
||||
protected String getNumbers(final String s) {
|
||||
return s.replaceAll("\\D", "");
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isNumber(t)? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public boolean isNumber(String strNum) {
|
||||
if (strNum == null) {
|
||||
return false;
|
||||
}
|
||||
return numberPattern.matcher(strNum).matches();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
|
|
|
@ -17,9 +17,13 @@ public class NumbersMatch extends AbstractComparator {
|
|||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
|
||||
//extracts numbers from the field
|
||||
String numbers1 = getNumbers(nfd(a));
|
||||
String numbers2 = getNumbers(nfd(b));
|
||||
|
||||
if (numbers1.isEmpty() && numbers2.isEmpty())
|
||||
return 1.0;
|
||||
|
||||
if (numbers1.isEmpty() || numbers2.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("romansMatch")
|
||||
public class RomansMatch extends AbstractComparator {
|
||||
|
||||
|
||||
public RomansMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
|
||||
//extracts romans from the field
|
||||
String romans1 = getRomans(nfd(a));
|
||||
String romans2 = getRomans(nfd(b));
|
||||
|
||||
if (romans1.isEmpty() && romans2.isEmpty())
|
||||
return 1.0;
|
||||
|
||||
if (romans1.isEmpty() || romans2.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
if (romans1.equals(romans2))
|
||||
return 1.0;
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
}
|
|
@ -11,7 +11,6 @@ ag
|
|||
alle
|
||||
allein
|
||||
allem
|
||||
allen
|
||||
aller
|
||||
allerdings
|
||||
alles
|
||||
|
|
|
@ -54,28 +54,9 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
|||
//particular cases
|
||||
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
|
||||
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
|
||||
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||
}
|
||||
|
||||
// @Test
|
||||
// public void testJaroWinklerNormalizedName6() {
|
||||
//
|
||||
// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
// double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
|
||||
//
|
||||
// System.out.println("result = " + result);
|
||||
// assertTrue(result > 0.9);
|
||||
//
|
||||
// }
|
||||
// @Test
|
||||
// public void testJaroWinklerNormalizedName10(){
|
||||
//
|
||||
// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
//
|
||||
// double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
|
||||
//
|
||||
// System.out.println("result = " + result);
|
||||
// }
|
||||
|
||||
@Test
|
||||
public void keywordMatchTest(){
|
||||
params.put("threshold", "0.5");
|
||||
|
@ -88,6 +69,10 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
|||
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
|
||||
assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
|
||||
assertEquals(0.0, keywordMatch.distance("University College London", "University of London", conf));
|
||||
assertEquals(0.0, keywordMatch.distance("Washington State University", "University of Washington", conf));
|
||||
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@ -108,7 +93,27 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
|||
final NumbersMatch numbersMatch = new NumbersMatch(params);
|
||||
|
||||
assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf));
|
||||
assertEquals(1.0, numbersMatch.distance("Universit<C9><U3> de Rennes 2", "Universita di Rennes 2", conf));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void romansMatchTest(){
|
||||
|
||||
final RomansMatch romansMatch = new RomansMatch(params);
|
||||
|
||||
assertEquals(-1.0, romansMatch.distance("University of Paris X", "Universita di Parigi", conf));
|
||||
assertEquals(0.0, romansMatch.distance("University of Paris IX", "University of Paris X", conf));
|
||||
assertEquals(1.0, romansMatch.distance("University of Paris VII", "University of Paris VII", conf));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void jaroWinklerNormalizedNameTest() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
|
||||
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||
|
|
Loading…
Reference in New Issue