forked from antonis.lempesis/dnet-hadoop
addition of doi normalization in PidMatch comparator, addition of keywordsclustering (clustering based on terms in the translation maps for the organizations), minor changes
This commit is contained in:
parent
2dcffb965f
commit
15bec5e876
|
@ -0,0 +1,39 @@
|
||||||
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import com.google.common.base.Joiner;
|
||||||
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
@ClusteringClass("keywordsclustering")
|
||||||
|
public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
|
|
||||||
|
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||||
|
|
||||||
|
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||||
|
|
||||||
|
public KeywordsClustering(Map<String, Integer> params) {
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Collection<String> doApply(String s) {
|
||||||
|
|
||||||
|
List<String> keywords = getCodes(s, translationMap, params.getOrDefault("windowSize", 4));
|
||||||
|
List<String> cities = getCodes(s, cityMap, params.getOrDefault("windowSize", 4));
|
||||||
|
|
||||||
|
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||||
|
|
||||||
|
int size = 0;
|
||||||
|
for (String keyword: keywords){
|
||||||
|
for (String city: cities) {
|
||||||
|
combinations.add(keyword+"-"+city);
|
||||||
|
if (++size>params.getOrDefault("max", 2)) {
|
||||||
|
return combinations;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return combinations;
|
||||||
|
}
|
||||||
|
}
|
|
@ -40,6 +40,8 @@ public abstract class AbstractPaceFunctions {
|
||||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||||
|
|
||||||
|
/**
 * Regular expression matching the decorations that may precede a DOI: the
 * {@code http(s)://dx.doi.org/} or {@code http(s)://doi.org/} resolver URL, or
 * the {@code doi:} scheme. The pattern is lower-case only, so it must be
 * applied to lower-cased input.
 */
public static final String DOI_PREFIX = "(https?:\\/\\/(?:dx\\.)?doi\\.org\\/)|(doi:)";
|
||||||
|
|
||||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
||||||
|
|
||||||
protected String concat(final List<String> l) {
|
protected String concat(final List<String> l) {
|
||||||
|
@ -313,5 +315,41 @@ public abstract class AbstractPaceFunctions {
|
||||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String normalizePid(String pid) {
|
||||||
|
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
//get the list of codes into the input string
|
||||||
|
public List<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){
|
||||||
|
|
||||||
|
String s = cleanup(s1);
|
||||||
|
|
||||||
|
s = filterAllStopWords(s);
|
||||||
|
|
||||||
|
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||||
|
|
||||||
|
List<String> codes = new ArrayList<>();
|
||||||
|
|
||||||
|
if (tokens.size()<windowSize)
|
||||||
|
windowSize = tokens.size();
|
||||||
|
|
||||||
|
int length = windowSize;
|
||||||
|
|
||||||
|
while (length != 0) {
|
||||||
|
|
||||||
|
for (int i = 0; i<=tokens.size()-length; i++){
|
||||||
|
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
|
||||||
|
if (translationMap.containsKey(candidate)) {
|
||||||
|
codes.add(translationMap.get(candidate));
|
||||||
|
s = s.replace(candidate, "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tokens = Arrays.asList(s.split(" "));
|
||||||
|
length-=1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return codes;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,9 +52,11 @@ public class PidMatch extends AbstractCondition {
|
||||||
return new ConditionEval(cond, a, b, result);
|
return new ConditionEval(cond, a, b, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//lowercase + normalization of the pid before adding it to the set
|
||||||
private Set<String> toHashSet(List<Pid> pbl) {
|
private Set<String> toHashSet(List<Pid> pbl) {
|
||||||
|
|
||||||
return pbl.stream()
|
return pbl.stream()
|
||||||
.map(pid -> pid.getType() + pid.getValue())
|
.map(pid -> pid.getType() + normalizePid(pid.getValue()))
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -43,8 +43,11 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||||
String ca = cleanup(a);
|
String ca = cleanup(a);
|
||||||
String cb = cleanup(b);
|
String cb = cleanup(b);
|
||||||
|
|
||||||
ca = removeStopwords(ca);
|
ca = normalize(ca);
|
||||||
cb = removeStopwords(cb);
|
cb = normalize(cb);
|
||||||
|
|
||||||
|
ca = filterAllStopWords(ca);
|
||||||
|
cb = filterAllStopWords(cb);
|
||||||
|
|
||||||
//replace keywords with codes
|
//replace keywords with codes
|
||||||
String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue());
|
String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue());
|
||||||
|
@ -80,16 +83,4 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String removeStopwords(String s) {
|
|
||||||
String normString = normalize(s);
|
|
||||||
|
|
||||||
normString = filterStopWords(normString, stopwordsIt);
|
|
||||||
normString = filterStopWords(normString, stopwordsEn);
|
|
||||||
normString = filterStopWords(normString, stopwordsDe);
|
|
||||||
normString = filterStopWords(normString, stopwordsFr);
|
|
||||||
normString = filterStopWords(normString, stopwordsPt);
|
|
||||||
normString = filterStopWords(normString, stopwordsEs);
|
|
||||||
|
|
||||||
return normString;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -7915,7 +7915,7 @@ city::5134086;Rochester;Ga-sko-sa-ga;Gaskosago;ROC;Rocestera;Rocesteris;Rocestro
|
||||||
city::5136454;Schenectady;SCH;Schenectady;Shinetadi;Skanehtati;Skanéhtati;Skenektadi;Skenektadis;Skunektadi;seukinegteodi;si ke nei ke ta di;sknktdy nywywrk;sknyktady;sqnqtdy;sukenekutadi;Скенектади;Скенектаді;Скънектади;Шинетади;סקנקטדי;سكنيكتادي;سکنکتدی، نیویورک;سکینکٹڈی ، نیویارک;سکینیکٹیڈی، نیو یارک;スケネクタディ;斯克内克塔迪;스키넥터디;
|
city::5136454;Schenectady;SCH;Schenectady;Shinetadi;Skanehtati;Skanéhtati;Skenektadi;Skenektadis;Skunektadi;seukinegteodi;si ke nei ke ta di;sknktdy nywywrk;sknyktady;sqnqtdy;sukenekutadi;Скенектади;Скенектаді;Скънектади;Шинетади;סקנקטדי;سكنيكتادي;سکنکتدی، نیویورک;سکینکٹڈی ، نیویارک;سکینیکٹیڈی، نیو یارک;スケネクタディ;斯克内克塔迪;스키넥터디;
|
||||||
city::5137849;Sheepshead Bay;;
|
city::5137849;Sheepshead Bay;;
|
||||||
city::5139568;Staten Island;Borough of Staten Island;Staten Island;
|
city::5139568;Staten Island;Borough of Staten Island;Staten Island;
|
||||||
city::5140405;Syracuse;Bogardus Corners;Cossitts Corners;Gorad Sirakjus;Kah-ya-hoo-neh;Ken-tue-ho-ne;Milan;Na-ta-dunk;SYR;Siracusa;Sirak'jus;Sirakjus;Sirakjuz;Sirakuso;Sirakuz;South Salina;Sy-kuse;Syracusae;Syracuse;Syracuse i New York;Tu-na-ten-tonk;sayrakywz;shirakyusu;shirakyuzu;sileokyuseu;sirekyuja;syrakywz;syrakywz nyw yark;syrakywz nywywrk;syrqywz;xi la qiu ci;Горад Сіракюс;Сиракуз;Сиракьюс;Сиракюз;Сиракјус;Сіракюс;סירקיוז;سائراکیوز;سيراكيوز;سیراکیوز، نیو یارک;سیراکیوز، نیویورک;सिरॅक्युज;სირაკიუსი;シラキュース;シラキューズ;锡拉丘兹;시러큐스;
|
city::5140405;Syracuse;Bogardus Corners;Cossitts Corners;Gorad Sirakjus;Kah-ya-hoo-neh;Ken-tue-ho-ne;Na-ta-dunk;SYR;Siracusa;Sirak'jus;Sirakjus;Sirakjuz;Sirakuso;Sirakuz;South Salina;Sy-kuse;Syracusae;Syracuse;Syracuse i New York;Tu-na-ten-tonk;sayrakywz;shirakyusu;shirakyuzu;sileokyuseu;sirekyuja;syrakywz;syrakywz nyw yark;syrakywz nywywrk;syrqywz;xi la qiu ci;Горад Сіракюс;Сиракуз;Сиракьюс;Сиракюз;Сиракјус;Сіракюс;סירקיוז;سائراکیوز;سيراكيوز;سیراکیوز، نیو یارک;سیراکیوز، نیویورک;सिरॅक्युज;სირაკიუსი;シラキュース;シラキューズ;锡拉丘兹;시러큐스;
|
||||||
city::5142056;Utica;Fort Schuyler;Gorad Jutyka;Iotekha;Iotékha;Jutika;Nundadasis;Old Fort Schuyler;Tevadahahtodague;Twa-dah-ah-lo-dah-que;UCA;Unungagages;Utica;Utika;Yanundadasis;ZUA;awtyka;you ti ka;yutika;ywtyka nywywrk;ywtyqh;Јутика;Горад Ютыка;Утика;Ютика;Ютіка;יוטיקה;أوتيكا;یوتیکا، نیویورک;یوٹیکا، نیو یارک;ユーティカ;由提卡;
|
city::5142056;Utica;Fort Schuyler;Gorad Jutyka;Iotekha;Iotékha;Jutika;Nundadasis;Old Fort Schuyler;Tevadahahtodague;Twa-dah-ah-lo-dah-que;UCA;Unungagages;Utica;Utika;Yanundadasis;ZUA;awtyka;you ti ka;yutika;ywtyka nywywrk;ywtyqh;Јутика;Горад Ютыка;Утика;Ютика;Ютіка;יוטיקה;أوتيكا;یوتیکا، نیویورک;یوٹیکا، نیو یارک;ユーティカ;由提卡;
|
||||||
city::5143056;Wakefield;;
|
city::5143056;Wakefield;;
|
||||||
city::5143307;Washington Heights;Harlem Heights;Pen-a-bick;Washington Heights;washintonhaitsu;wosingteonhaicheu;ワシントンハイツ;워싱턴하이츠;
|
city::5143307;Washington Heights;Harlem Heights;Pen-a-bick;Washington Heights;washintonhaitsu;wosingteonhaicheu;ワシントンハイツ;워싱턴하이츠;
|
||||||
|
|
Can't render this file because it is too large.
|
|
@ -332,7 +332,6 @@ male
|
||||||
malgrado
|
malgrado
|
||||||
malissimo
|
malissimo
|
||||||
mancanza
|
mancanza
|
||||||
marche
|
|
||||||
me
|
me
|
||||||
medesimo
|
medesimo
|
||||||
mediante
|
mediante
|
||||||
|
|
|
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
|
||||||
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
|
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
|
||||||
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
|
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
|
||||||
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
|
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
|
||||||
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο
|
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university
|
||||||
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
|
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
|
||||||
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
|
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
|
||||||
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
|
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
|
||||||
|
|
|
|
@ -117,4 +117,19 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
System.out.println(cf.apply(Lists.newArrayList(person(s))));
|
System.out.println(cf.apply(Lists.newArrayList(person(s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testKeywordsClustering() {
|
||||||
|
|
||||||
|
final ClusteringFunction cf = new KeywordsClustering(params);
|
||||||
|
final String s = "Polytechnic University of Turin";
|
||||||
|
System.out.println(s);
|
||||||
|
System.out.println(cf.apply(Lists.newArrayList(title(s))));
|
||||||
|
|
||||||
|
final String s1 = "POLITECNICO DI TORINO";
|
||||||
|
System.out.println(s1);
|
||||||
|
System.out.println(cf.apply(Lists.newArrayList(title(s1))));
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
package eu.dnetlib.pace.common;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import static junit.framework.Assert.assertEquals;
|
||||||
|
import static junit.framework.Assert.assertTrue;
|
||||||
|
|
||||||
|
public class PaceFunctionTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void normalizePidTest(){
|
||||||
|
|
||||||
|
assertEquals("identifier", normalizePid("IdentifIer"));
|
||||||
|
assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347"));
|
||||||
|
assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI"));
|
||||||
|
assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI"));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void filterAllStopwordsTest(){
|
||||||
|
|
||||||
|
assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche"));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.AbstractPaceTest;
|
||||||
|
|
||||||
|
/**
 * Test class for the conditions in this package; it currently declares no
 * test methods.
 */
public class ConditionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
|
}
|
|
@ -104,4 +104,14 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testJaroWinklerNormalizedName7() {
|
||||||
|
|
||||||
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO");
|
||||||
|
|
||||||
|
System.out.println("result = " + result);
|
||||||
|
assertTrue(result> 0.9);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue