From 15bec5e876c02436633923955fb0ee93ed6299a2 Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 8 Jul 2019 09:44:02 +0200 Subject: [PATCH] addition of doi normalization in PidMatch comparator, addition of keywordsclustering (clustering based on terms in the translation maps for the organizations), minor changes --- .../pace/clustering/KeywordsClustering.java | 39 +++++++++++++++++++ .../pace/common/AbstractPaceFunctions.java | 38 ++++++++++++++++++ .../eu/dnetlib/pace/condition/PidMatch.java | 4 +- .../algo/JaroWinklerNormalizedName.java | 19 +++------ .../eu/dnetlib/pace/config/city_map.csv | 2 +- .../eu/dnetlib/pace/config/stopwords_it.txt | 1 - .../dnetlib/pace/config/translation_map.csv | 2 +- .../clustering/ClusteringFunctionTest.java | 15 +++++++ .../dnetlib/pace/common/PaceFunctionTest.java | 25 ++++++++++++ .../dnetlib/pace/condition/ConditionTest.java | 7 ++++ .../pace/distance/DistanceAlgoTest.java | 10 +++++ 11 files changed, 144 insertions(+), 18 deletions(-) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/condition/ConditionTest.java diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java new file mode 100644 index 000000000..1aac6c8a5 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -0,0 +1,39 @@ +package eu.dnetlib.pace.clustering; + +import com.google.common.base.Joiner; +import eu.dnetlib.pace.common.AbstractPaceFunctions; + +import java.util.*; + +@ClusteringClass("keywordsclustering") +public class KeywordsClustering extends AbstractClusteringFunction { + + private static Map translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv"); + + private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); + + public KeywordsClustering(Map params) { + super(params); + } + + @Override + protected Collection doApply(String s) { + + List keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4)); + List cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4)); + + final Collection combinations = new LinkedHashSet(); + + int size = 0; + for (String keyword: keywords){ + for (String city: cities) { + combinations.add(keyword+"-"+city); + if (++size>params.getOrDefault("max", 2)) { + return combinations; + } + } + } + + return combinations; + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 977c5c46e..8c99342a7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -40,6 +40,8 @@ public abstract class AbstractPaceFunctions { private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; + public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; + protected final static FieldList EMPTY_FIELD = new FieldListImpl(); protected String concat(final List l) { @@ -313,5 +315,41 @@ public abstract class AbstractPaceFunctions { return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens); } + public String normalizePid(String pid) { + return pid.toLowerCase().replaceAll(DOI_PREFIX, ""); + } + + //get the list of codes into the input string + public List getCodes(String s1, Map translationMap, int windowSize){ + + String s = cleanup(s1); + + s = filterAllStopWords(s); + + List tokens = Arrays.asList(s.toLowerCase().split(" ")); + + List codes = new ArrayList<>(); + + if (tokens.size() toHashSet(List pbl) { + return pbl.stream() - .map(pid -> pid.getType() + pid.getValue()) + .map(pid -> pid.getType() + normalizePid(pid.getValue())) .collect(Collectors.toCollection(HashSet::new)); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java index fea74af37..285575a1b 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java @@ -43,8 +43,11 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { String ca = cleanup(a); String cb = cleanup(b); - ca = removeStopwords(ca); - cb = removeStopwords(cb); + ca = normalize(ca); + cb = normalize(cb); + + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); //replace keywords with codes String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue()); @@ -80,16 +83,4 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { return d; } - public String removeStopwords(String s) { - String normString = normalize(s); - - normString = filterStopWords(normString, stopwordsIt); - normString = filterStopWords(normString, stopwordsEn); - normString = filterStopWords(normString, stopwordsDe); - normString = filterStopWords(normString, stopwordsFr); - normString = filterStopWords(normString, stopwordsPt); - normString = filterStopWords(normString, stopwordsEs); - - return normString; - } } diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv index 5f70a373b..e2d48551d 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv @@ -7915,7 +7915,7 @@ city::5134086;Rochester;Ga-sko-sa-ga;Gaskosago;ROC;Rocestera;Rocesteris;Rocestro city::5136454;Schenectady;SCH;Schenectady;Shinetadi;Skanehtati;Skanéhtati;Skenektadi;Skenektadis;Skunektadi;seukinegteodi;si ke nei ke ta di;sknktdy nywywrk;sknyktady;sqnqtdy;sukenekutadi;Скенектади;Скенектаді;Скънектади;Шинетади;סקנקטדי;سكنيكتادي;سکنکتدی، نیویورک;سکینکٹڈی ، نیویارک;سکینیکٹیڈی، نیو یارک;スケネクタディ;斯克内克塔迪;스키넥터디; city::5137849;Sheepshead Bay;; city::5139568;Staten Island;Borough of Staten Island;Staten Island; -city::5140405;Syracuse;Bogardus Corners;Cossitts Corners;Gorad Sirakjus;Kah-ya-hoo-neh;Ken-tue-ho-ne;Milan;Na-ta-dunk;SYR;Siracusa;Sirak'jus;Sirakjus;Sirakjuz;Sirakuso;Sirakuz;South Salina;Sy-kuse;Syracusae;Syracuse;Syracuse i New York;Tu-na-ten-tonk;sayrakywz;shirakyusu;shirakyuzu;sileokyuseu;sirekyuja;syrakywz;syrakywz nyw yark;syrakywz nywywrk;syrqywz;xi la qiu ci;Горад Сіракюс;Сиракуз;Сиракьюс;Сиракюз;Сиракјус;Сіракюс;סירקיוז;سائراکیوز;سيراكيوز;سیراکیوز، نیو یارک;سیراکیوز، نیویورک;सिरॅक्युज;სირაკიუსი;シラキュース;シラキューズ;锡拉丘兹;시러큐스; +city::5140405;Syracuse;Bogardus Corners;Cossitts Corners;Gorad Sirakjus;Kah-ya-hoo-neh;Ken-tue-ho-ne;Na-ta-dunk;SYR;Siracusa;Sirak'jus;Sirakjus;Sirakjuz;Sirakuso;Sirakuz;South Salina;Sy-kuse;Syracusae;Syracuse;Syracuse i New York;Tu-na-ten-tonk;sayrakywz;shirakyusu;shirakyuzu;sileokyuseu;sirekyuja;syrakywz;syrakywz nyw yark;syrakywz nywywrk;syrqywz;xi la qiu ci;Горад Сіракюс;Сиракуз;Сиракьюс;Сиракюз;Сиракјус;Сіракюс;סירקיוז;سائراکیوز;سيراكيوز;سیراکیوز، نیو یارک;سیراکیوز، نیویورک;सिरॅक्युज;სირაკიუსი;シラキュース;シラキューズ;锡拉丘兹;시러큐스; city::5142056;Utica;Fort Schuyler;Gorad Jutyka;Iotekha;Iotékha;Jutika;Nundadasis;Old Fort Schuyler;Tevadahahtodague;Twa-dah-ah-lo-dah-que;UCA;Unungagages;Utica;Utika;Yanundadasis;ZUA;awtyka;you ti ka;yutika;ywtyka nywywrk;ywtyqh;Јутика;Горад Ютыка;Утика;Ютика;Ютіка;יוטיקה;أوتيكا;یوتیکا، نیویورک;یوٹیکا، نیو یارک;ユーティカ;由提卡; city::5143056;Wakefield;; city::5143307;Washington Heights;Harlem Heights;Pen-a-bick;Washington Heights;washintonhaitsu;wosingteonhaicheu;ワシントンハイツ;워싱턴하이츠; diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt index 2ce975b13..8d0ceb776 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt @@ -332,7 +332,6 @@ male malgrado malissimo mancanza -marche me medesimo mediante diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv index ef49c2f9e..dc77a6057 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv @@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο key::39;bureau;ufficio;bureau;офис;bureau;γραφείο key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία -key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο +key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 71c23a811..270a44b65 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -117,4 +117,19 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println(cf.apply(Lists.newArrayList(person(s)))); } + @Test + public void testKeywordsClustering() { + + final ClusteringFunction cf = new KeywordsClustering(params); + final String s = "Polytechnic University of Turin"; + System.out.println(s); + System.out.println(cf.apply(Lists.newArrayList(title(s)))); + + final String s1 = "POLITECNICO DI TORINO"; + System.out.println(s1); + System.out.println(cf.apply(Lists.newArrayList(title(s1)))); + + + } + } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java new file mode 100644 index 000000000..e15b54331 --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java @@ -0,0 +1,25 @@ +package eu.dnetlib.pace.common; + +import org.junit.Test; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; + +public class PaceFunctionTest extends AbstractPaceFunctions { + + @Test + public void normalizePidTest(){ + + assertEquals("identifier", normalizePid("IdentifIer")); + assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347")); + assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI")); + assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI")); + + } + + @Test + public void filterAllStopwordsTest(){ + + assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche")); + } +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/condition/ConditionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/condition/ConditionTest.java new file mode 100644 index 000000000..57047d8c9 --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/condition/ConditionTest.java @@ -0,0 +1,7 @@ +package eu.dnetlib.pace.condition; + +import eu.dnetlib.pace.AbstractPaceTest; + +public class ConditionTest extends AbstractPaceTest { + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 3943e4f8e..6f196ec4f 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -104,4 +104,14 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { } + @Test + public void testJaroWinklerNormalizedName7() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO"); + + System.out.println("result = " + result); + assertTrue(result> 0.9); + } + }