Implementation of RomansMatch and re-implementation of the getNumbers function. New terms in the translation map and update of the configuration.

This commit is contained in:
miconis 2019-11-28 16:54:44 +01:00
parent f791730330
commit 49f9beb4a8
6 changed files with 80 additions and 23 deletions

View File

@ -17,6 +17,7 @@ import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
/** /**
@ -48,6 +49,8 @@ public abstract class AbstractPaceFunctions {
//doi prefix for normalization //doi prefix for normalization
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
protected final static FieldList EMPTY_FIELD = new FieldListImpl(); protected final static FieldList EMPTY_FIELD = new FieldListImpl();
protected String concat(final List<String> l) { protected String concat(final List<String> l) {
@ -92,7 +95,18 @@ public abstract class AbstractPaceFunctions {
} }
/**
 * Extracts the numeric tokens of a string.
 *
 * The input is split on single space characters; every token accepted by
 * isNumber is appended to the result and every other token is dropped.
 * Accepted tokens are joined with no separator between them.
 *
 * NOTE(review): the first two lines of this block show the removed and the
 * added side of the change on the same line; this description follows the
 * added side (the token loop).
 *
 * @param s the string to scan
 * @return the concatenation of the numeric tokens, empty when there are none
 */
protected String getNumbers(final String s) { protected String getNumbers(final String s) {
return s.replaceAll("\\D", ""); final StringBuilder sb = new StringBuilder();
for (final String t : s.split(" ")) {
// a token that is not a number contributes the empty string
sb.append(isNumber(t)? t : "");
}
return sb.toString();
}
/**
 * Tells whether a whole string matches numberPattern.
 *
 * @param strNum the string to check; may be null
 * @return false when strNum is null, otherwise whether the entire string
 *         matches numberPattern
 */
public boolean isNumber(String strNum) {
// a null token is never a number
if (strNum == null) {
return false;
}
// matches() requires the full string to match, not just a part of it
return numberPattern.matcher(strNum).matches();
} }
protected static String fixAliases(final String s) { protected static String fixAliases(final String s) {

View File

@ -17,9 +17,13 @@ public class NumbersMatch extends AbstractComparator {
@Override @Override
public double distance(String a, String b, Config conf) { public double distance(String a, String b, Config conf) {
//extracts numbers from the field
String numbers1 = getNumbers(nfd(a)); String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b)); String numbers2 = getNumbers(nfd(b));
if (numbers1.isEmpty() && numbers2.isEmpty())
return 1.0;
if (numbers1.isEmpty() || numbers2.isEmpty()) if (numbers1.isEmpty() || numbers2.isEmpty())
return -1.0; return -1.0;

View File

@ -0,0 +1,35 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator registered as "romansMatch": compares the roman numerals that
 * getRomans extracts from two field values.
 */
@ComparatorClass("romansMatch")
public class RomansMatch extends AbstractComparator {

    public RomansMatch(Map<String, String> params) {
        super(params);
    }

    /**
     * Compares the romans found in the two values.
     *
     * @param a    first field value
     * @param b    second field value
     * @param conf configuration (not used by this comparator)
     * @return -1.0 when exactly one value yields romans; otherwise 1.0 when the
     *         extracted romans are equal (including when neither value yields
     *         any) and 0.0 when they differ
     */
    @Override
    public double distance(String a, String b, Config conf) {
        final String left = getRomans(nfd(a));
        final String right = getRomans(nfd(b));

        final boolean leftMissing = left.isEmpty();
        final boolean rightMissing = right.isEmpty();

        // romans on one side only
        if (leftMissing != rightMissing) {
            return -1.0;
        }

        // both sides empty (two empty strings are equal) or both sides populated
        return left.equals(right) ? 1.0 : 0.0;
    }
}

View File

@ -11,7 +11,6 @@ ag
alle alle
allein allein
allem allem
allen
aller aller
allerdings allerdings
alles alles

View File

@ -54,28 +54,9 @@ public class ComparatorTest extends AbstractPaceFunctions {
//particular cases //particular cases
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf)); assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf)); assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
} }
// @Test
// public void testJaroWinklerNormalizedName6() {
//
// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
// double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
//
// System.out.println("result = " + result);
// assertTrue(result > 0.9);
//
// }
// @Test
// public void testJaroWinklerNormalizedName10(){
//
// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
//
// double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
//
// System.out.println("result = " + result);
// }
@Test @Test
public void keywordMatchTest(){ public void keywordMatchTest(){
params.put("threshold", "0.5"); params.put("threshold", "0.5");
@ -88,6 +69,10 @@ public class ComparatorTest extends AbstractPaceFunctions {
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf)); assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.0, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.0, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
} }
@ -108,7 +93,27 @@ public class ComparatorTest extends AbstractPaceFunctions {
final NumbersMatch numbersMatch = new NumbersMatch(params); final NumbersMatch numbersMatch = new NumbersMatch(params);
assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf)); assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf));
assertEquals(1.0, numbersMatch.distance("Universit<C9><U3> de Rennes 2", "Universita di Rennes 2", conf));
} }
/**
 * Exercises RomansMatch on three pairs of names: a roman numeral on one side
 * only, different numerals, and the same numeral on both sides.
 */
@Test
public void romansMatchTest(){
    final RomansMatch comparator = new RomansMatch(params);

    // numeral present in the first name only
    final double oneSided = comparator.distance("University of Paris X", "Universita di Parigi", conf);
    assertEquals(-1.0, oneSided);

    // IX against X
    final double different = comparator.distance("University of Paris IX", "University of Paris X", conf);
    assertEquals(0.0, different);

    // VII against VII
    final double identical = comparator.distance("University of Paris VII", "University of Paris VII", conf);
    assertEquals(1.0, identical);
}
/**
 * Runs JaroWinklerNormalizedName on a fixed pair of names and prints the score.
 *
 * NOTE(review): nothing is asserted here, so this test can only fail if the
 * comparator throws; consider pinning an expected range once it is agreed.
 */
@Test
public void jaroWinklerNormalizedNameTest() {
    final JaroWinklerNormalizedName comparator = new JaroWinklerNormalizedName(params);
    final String first = "AT&T (United States)";
    final String second = "United States Military Academy";

    final double score = comparator.distance(first, second, conf);
    System.out.println("result = " + score);
}
} }

View File

@ -33,7 +33,7 @@
"legalname" : [] "legalname" : []
}, },
"synonyms": { "synonyms": {
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], "key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],