forked from D-Net/dnet-hadoop
Implementation of RomansMatch and re-implementation of the getNumbers function. New terms in the translation map and update of the configuration.
This commit is contained in:
parent
f791730330
commit
49f9beb4a8
|
@ -17,6 +17,7 @@ import java.io.IOException;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -48,6 +49,8 @@ public abstract class AbstractPaceFunctions {
|
||||||
//doi prefix for normalization
|
//doi prefix for normalization
|
||||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||||
|
|
||||||
|
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||||
|
|
||||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
||||||
|
|
||||||
protected String concat(final List<String> l) {
|
protected String concat(final List<String> l) {
|
||||||
|
@ -92,7 +95,18 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getNumbers(final String s) {
|
protected String getNumbers(final String s) {
|
||||||
return s.replaceAll("\\D", "");
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
for (final String t : s.split(" ")) {
|
||||||
|
sb.append(isNumber(t)? t : "");
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isNumber(String strNum) {
|
||||||
|
if (strNum == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return numberPattern.matcher(strNum).matches();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static String fixAliases(final String s) {
|
protected static String fixAliases(final String s) {
|
||||||
|
|
|
@ -17,9 +17,13 @@ public class NumbersMatch extends AbstractComparator {
|
||||||
@Override
|
@Override
|
||||||
public double distance(String a, String b, Config conf) {
|
public double distance(String a, String b, Config conf) {
|
||||||
|
|
||||||
|
//extracts numbers from the field
|
||||||
String numbers1 = getNumbers(nfd(a));
|
String numbers1 = getNumbers(nfd(a));
|
||||||
String numbers2 = getNumbers(nfd(b));
|
String numbers2 = getNumbers(nfd(b));
|
||||||
|
|
||||||
|
if (numbers1.isEmpty() && numbers2.isEmpty())
|
||||||
|
return 1.0;
|
||||||
|
|
||||||
if (numbers1.isEmpty() || numbers2.isEmpty())
|
if (numbers1.isEmpty() || numbers2.isEmpty())
|
||||||
return -1.0;
|
return -1.0;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ComparatorClass("romansMatch")
|
||||||
|
public class RomansMatch extends AbstractComparator {
|
||||||
|
|
||||||
|
|
||||||
|
public RomansMatch(Map<String, String> params) {
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double distance(String a, String b, Config conf) {
|
||||||
|
|
||||||
|
//extracts romans from the field
|
||||||
|
String romans1 = getRomans(nfd(a));
|
||||||
|
String romans2 = getRomans(nfd(b));
|
||||||
|
|
||||||
|
if (romans1.isEmpty() && romans2.isEmpty())
|
||||||
|
return 1.0;
|
||||||
|
|
||||||
|
if (romans1.isEmpty() || romans2.isEmpty())
|
||||||
|
return -1.0;
|
||||||
|
|
||||||
|
if (romans1.equals(romans2))
|
||||||
|
return 1.0;
|
||||||
|
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -11,7 +11,6 @@ ag
|
||||||
alle
|
alle
|
||||||
allein
|
allein
|
||||||
allem
|
allem
|
||||||
allen
|
|
||||||
aller
|
aller
|
||||||
allerdings
|
allerdings
|
||||||
alles
|
alles
|
||||||
|
|
|
@ -54,28 +54,9 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
//particular cases
|
//particular cases
|
||||||
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
|
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
|
||||||
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
|
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
|
||||||
|
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Test
|
|
||||||
// public void testJaroWinklerNormalizedName6() {
|
|
||||||
//
|
|
||||||
// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
// double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
|
|
||||||
//
|
|
||||||
// System.out.println("result = " + result);
|
|
||||||
// assertTrue(result > 0.9);
|
|
||||||
//
|
|
||||||
// }
|
|
||||||
// @Test
|
|
||||||
// public void testJaroWinklerNormalizedName10(){
|
|
||||||
//
|
|
||||||
// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
//
|
|
||||||
// double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
|
|
||||||
//
|
|
||||||
// System.out.println("result = " + result);
|
|
||||||
// }
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void keywordMatchTest(){
|
public void keywordMatchTest(){
|
||||||
params.put("threshold", "0.5");
|
params.put("threshold", "0.5");
|
||||||
|
@ -88,6 +69,10 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
||||||
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
|
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
|
||||||
assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
|
assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
|
||||||
|
assertEquals(0.0, keywordMatch.distance("University College London", "University of London", conf));
|
||||||
|
assertEquals(0.0, keywordMatch.distance("Washington State University", "University of Washington", conf));
|
||||||
|
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -108,6 +93,26 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
final NumbersMatch numbersMatch = new NumbersMatch(params);
|
final NumbersMatch numbersMatch = new NumbersMatch(params);
|
||||||
|
|
||||||
assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf));
|
assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf));
|
||||||
|
assertEquals(1.0, numbersMatch.distance("Universit<C9><U3> de Rennes 2", "Universita di Rennes 2", conf));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void romansMatchTest(){
|
||||||
|
|
||||||
|
final RomansMatch romansMatch = new RomansMatch(params);
|
||||||
|
|
||||||
|
assertEquals(-1.0, romansMatch.distance("University of Paris X", "Universita di Parigi", conf));
|
||||||
|
assertEquals(0.0, romansMatch.distance("University of Paris IX", "University of Paris X", conf));
|
||||||
|
assertEquals(1.0, romansMatch.distance("University of Paris VII", "University of Paris VII", conf));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void jaroWinklerNormalizedNameTest() {
|
||||||
|
|
||||||
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
|
||||||
|
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
|
||||||
|
System.out.println("result = " + result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
"legalname" : []
|
"legalname" : []
|
||||||
},
|
},
|
||||||
"synonyms": {
|
"synonyms": {
|
||||||
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
|
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
||||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||||
|
|
Loading…
Reference in New Issue