From 49f9beb4a8c45844de037e5f61bd27622326b773 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 28 Nov 2019 16:54:44 +0100 Subject: [PATCH] implementation of romansmatch and re-implementation of the getNumber function. New terms in the translation map and update of the configuration --- .../pace/common/AbstractPaceFunctions.java | 16 ++++++- .../eu/dnetlib/pace/tree/NumbersMatch.java | 4 ++ .../eu/dnetlib/pace/tree/RomansMatch.java | 35 +++++++++++++++ .../eu/dnetlib/pace/config/stopwords_de.txt | 1 - .../pace/comparators/ComparatorTest.java | 45 ++++++++++--------- .../pace/config/organization.current.conf | 2 +- 6 files changed, 80 insertions(+), 23 deletions(-) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 71954a394..b9f30ff69 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -17,6 +17,7 @@ import java.io.IOException; import java.io.StringWriter; import java.text.Normalizer; import java.util.*; +import java.util.regex.Pattern; import java.util.stream.Collectors; /** @@ -48,6 +49,8 @@ public abstract class AbstractPaceFunctions { //doi prefix for normalization public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; + private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); + protected final static FieldList EMPTY_FIELD = new FieldListImpl(); protected String concat(final List l) { @@ -92,7 +95,18 @@ public abstract class AbstractPaceFunctions { } protected String getNumbers(final String s) { - return s.replaceAll("\\D", ""); + final StringBuilder sb = new StringBuilder(); + for (final String t : s.split(" ")) { + sb.append(isNumber(t)? t : ""); + } + return sb.toString(); + } + + public boolean isNumber(String strNum) { + if (strNum == null) { + return false; + } + return numberPattern.matcher(strNum).matches(); } protected static String fixAliases(final String s) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java index c2300d207..ce60a672a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java @@ -17,9 +17,13 @@ public class NumbersMatch extends AbstractComparator { @Override public double distance(String a, String b, Config conf) { + //extracts numbers from the field String numbers1 = getNumbers(nfd(a)); String numbers2 = getNumbers(nfd(b)); + if (numbers1.isEmpty() && numbers2.isEmpty()) + return 1.0; + if (numbers1.isEmpty() || numbers2.isEmpty()) return -1.0; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java new file mode 100644 index 000000000..bdbde9610 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java @@ -0,0 +1,35 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("romansMatch") +public class RomansMatch extends AbstractComparator { + + + public RomansMatch(Map params) { + super(params); + } + + @Override + public double distance(String a, String b, Config conf) { + + //extracts romans from the field + String romans1 = getRomans(nfd(a)); + String romans2 = getRomans(nfd(b)); + + if (romans1.isEmpty() && romans2.isEmpty()) + return 1.0; + + if (romans1.isEmpty() || romans2.isEmpty()) + return -1.0; + + if (romans1.equals(romans2)) + return 1.0; + + return 0.0; + } +} diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_de.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_de.txt index 24666a649..c9478968e 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_de.txt +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_de.txt @@ -11,7 +11,6 @@ ag alle allein allem -allen aller allerdings alles diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 7e69b0e3a..905fac4bd 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -54,28 +54,9 @@ public class ComparatorTest extends AbstractPaceFunctions { //particular cases assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf)); assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf)); + assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf)); } - // @Test -// public void testJaroWinklerNormalizedName6() { -// -// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); -// double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf); -// -// System.out.println("result = " + result); -// assertTrue(result > 0.9); -// -// } -// @Test -// public void testJaroWinklerNormalizedName10(){ -// -// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); -// -// double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf); -// -// System.out.println("result = " + result); -// } - @Test public void keywordMatchTest(){ params.put("threshold", "0.5"); @@ -88,6 +69,10 @@ public class ComparatorTest extends AbstractPaceFunctions { assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf)); assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); + assertEquals(0.0, keywordMatch.distance("University College London", "University of London", conf)); + assertEquals(0.0, keywordMatch.distance("Washington State University", "University of Washington", conf)); + assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf)); + } @@ -108,7 +93,27 @@ public class ComparatorTest extends AbstractPaceFunctions { final NumbersMatch numbersMatch = new NumbersMatch(params); assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf)); + assertEquals(1.0, numbersMatch.distance("Universit de Rennes 2", "Universita di Rennes 2", conf)); } + @Test + public void romansMatchTest(){ + + final RomansMatch romansMatch = new RomansMatch(params); + + assertEquals(-1.0, romansMatch.distance("University of Paris X", "Universita di Parigi", conf)); + assertEquals(0.0, romansMatch.distance("University of Paris IX", "University of Paris X", conf)); + assertEquals(1.0, romansMatch.distance("University of Paris VII", "University of Paris VII", conf)); + } + + @Test + public void jaroWinklerNormalizedNameTest() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + + double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf); + System.out.println("result = " + result); + } + } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf index b2ab6ae92..f92655f3f 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf @@ -33,7 +33,7 @@ "legalname" : [] }, "synonyms": { - "key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],