Add DOI normalization to the PidMatch comparator; add keywordsclustering (clustering based on terms found in the translation maps for organizations); minor changes

This commit is contained in:
miconis 2019-07-08 09:44:02 +02:00
parent 9f6fb0e030
commit 2b866cfbeb
14 changed files with 160 additions and 24 deletions

View File

@ -61,10 +61,9 @@ public class SparkLocalTest {
.map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator();
}).groupByKey();//group documents basing on the key
// blocks = blocks.filter(b -> Iterables.size(b._2())>2);
// vertexes = blocks.flatMap(b -> b._2().iterator()).map(t -> new Tuple2<Object, MapDocument>((long) t.getIdentifier().hashCode(), t)).rdd();
//print blocks
// blocks = blocks.filter(b -> Iterables.size(b._2())>1);
//// vertexes = blocks.flatMap(b -> b._2().iterator()).map(t -> new Tuple2<Object, MapDocument>((long) t.getIdentifier().hashCode(), t)).rdd();
// blocks.map(group -> new DocumentsBlock(group._1(), group._2())).foreach(b -> System.out.println(b));
//create relations by comparing only elements in the same group

View File

@ -14,7 +14,8 @@
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"strictConditions" : [
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
@ -25,7 +26,7 @@
],
"model" : [
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "false", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }

View File

@ -27,4 +27,14 @@
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Vrije_Universiteit_Amsterdam"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"VU"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.vu.nl/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Vrije Universiteit Amsterdam"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::40e0928728ca1ea6ebb147ad307fc7db"}
{"dateoftransformation":"2018-11-12","originalId":["opendoar____::Burgas_Free_University"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Бургаски свободен университет"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.bfu.bg/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Burgas Free University"},"country":{"classid":"BG","classname":"Bulgaria","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-11-12","type":20,"id":"20|opendoar____::28a99bd2330504b0dfb6c44192757bde"}
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Université_libre_de_Bruxelles"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.ulb.ac.be/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Université libre de Bruxelles"},"country":{"classid":"BE","classname":"Belgium","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-07-21","type":20,"id":"20|opendoar____::0e3d292f95a8f13fed04d7b3ac872b9f"}
{"dateoftransformation":"2018-11-12","originalId":["opendoar____::Freie_Universitat_Berlin"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Freie Universitat Berlin"},"country":{"classid":"DE","classname":"Germany","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-11-12","type":20,"id":"20|opendoar____::5054b113a655361d929493a95d29e6f1"}
{"dateoftransformation":"2018-11-12","originalId":["opendoar____::Freie_Universitat_Berlin"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Freie Universitat Berlin"},"country":{"classid":"DE","classname":"Germany","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-11-12","type":20,"id":"20|opendoar____::5054b113a655361d929493a95d29e6f1"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::999879881"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"POLITECNICO DI MILANO"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.polimi.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"POLITECNICO DI MILANO"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::671b76de97f62c7ecf4a18cd5c5a24ce"}
{"dateoftransformation":"2018-12-15","originalId":["corda__h2020::999879881"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"PoliMi"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.polimi.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"POLITECNICO DI MILANO"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::671b76de97f62c7ecf4a18cd5c5a24ce"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::999866689"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVPM"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univpm.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA POLITECNICA DELLE MARCHE"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::f050abb6c5aadea5488de784874ce4e6"}
{"dateoftransformation":"2018-12-15","originalId":["corda__h2020::999977754"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"POLITO"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.polito.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"POLITECNICO DI TORINO"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::c565531bf1c09673c8d4ca4228b9d307"}
{"dateoftransformation":"2018-12-15","originalId":["corda__h2020::999866689"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVPM"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univpm.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA POLITECNICA DELLE MARCHE"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::f050abb6c5aadea5488de784874ce4e6"}
{"dateoftransformation":"2018-12-15","originalId":["corda__h2020::999431159"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"POLIBA"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.poliba.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"POLITECNICO DI BARI"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::395a43af55ac010c4781a6c8645db8a6"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::999431159"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"POLIBA"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"POLITECNICO DI BARI"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::395a43af55ac010c4781a6c8645db8a6"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::999977754"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"POLITO"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.polito.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"POLITECNICO DI TORINO"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::c565531bf1c09673c8d4ca4228b9d307"}
{"dateoftransformation":"2019-05-19","originalId":["rcuk________::84803376-D657-41A3-A5DB-E1341282462E"],"collectedfrom":[{"value":"Research Councils UK","key":"10|openaire____::ab2d3310741ea80d3b8726f651502858"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Polytechnic University of Milan"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-11-04","type":20,"id":"20|rcuk________::a1aacefbbd7d0b6ebd2085941388b46d"}
{"dateoftransformation":"2019-05-19","originalId":["rcuk________::16A2AC44-FDDA-4753-A927-26136DDCDA6D"],"collectedfrom":[{"value":"Research Councils UK","key":"10|openaire____::ab2d3310741ea80d3b8726f651502858"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Polytechnic University of Turin"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-11-03","type":20,"id":"20|rcuk________::146b02f079957d07f10099a0b53e02c3"}

View File

@ -0,0 +1,39 @@
package eu.dnetlib.pace.clustering;
import com.google.common.base.Joiner;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import java.util.*;
/**
 * Clustering function that builds blocking keys by pairing every keyword code found in the
 * input string with every city code found in the same string.
 *
 * <p>Keyword codes are looked up in {@code translation_map.csv} and city codes in
 * {@code city_map.csv}, both loaded once from the classpath. Each key has the form
 * {@code <keywordCode>-<cityCode>}.
 *
 * <p>Supported parameters:
 * <ul>
 *   <li>{@code windowSize} (default 4): widest token window passed to {@code getCodes};</li>
 *   <li>{@code max} (default 2): maximum number of distinct keys returned.</li>
 * </ul>
 */
@ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction {

    private static final Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");

    private static final Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");

    public KeywordsClustering(Map<String, Integer> params) {
        super(params);
    }

    /**
     * Produces the keyword-city keys for the given string.
     *
     * @param s the field value to cluster
     * @return the distinct {@code keyword-city} keys in discovery order, capped at {@code max};
     *         empty when no keyword or no city code is found
     */
    @Override
    protected Collection<String> doApply(String s) {
        // read the parameters once instead of on every loop iteration
        final int windowSize = params.getOrDefault("windowSize", 4);
        final int max = params.getOrDefault("max", 2);

        final List<String> keywords = getCodes(s, translationMap, windowSize);
        final List<String> cities = getCodes(s, cityMap, windowSize);

        final Collection<String> combinations = new LinkedHashSet<>();

        for (String keyword : keywords) {
            for (String city : cities) {
                combinations.add(keyword + "-" + city);
                // Compare against the number of DISTINCT keys collected so far: the previous
                // pre-incremented counter let max + 1 keys through and also counted duplicates
                // that the set silently discards. A max below 1 still yields the first key.
                if (combinations.size() >= max) {
                    return combinations;
                }
            }
        }

        return combinations;
    }
}

View File

@ -40,6 +40,8 @@ public abstract class AbstractPaceFunctions {
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
// Regex alternation matching either an "http://dx.doi.org/" / "https://dx.doi.org/" resolver prefix
// or a "doi:" scheme prefix. The pattern is not anchored, so it matches anywhere in the input.
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
protected String concat(final List<String> l) {
@ -313,5 +315,41 @@ public abstract class AbstractPaceFunctions {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
}
/**
 * Normalizes a persistent identifier so that equivalent spellings compare equal.
 *
 * <p>The value is lower-cased and every match of {@code DOI_PREFIX} is removed.
 *
 * @param pid the raw identifier value; must not be {@code null}
 * @return the lower-cased identifier with the DOI prefixes stripped
 */
public String normalizePid(String pid) {
    // Locale.ROOT keeps the case folding independent of the JVM default locale
    // (under a Turkish locale, plain toLowerCase() maps 'I' to dotless 'ı').
    return pid.toLowerCase(java.util.Locale.ROOT).replaceAll(DOI_PREFIX, "");
}
/**
 * Collects the codes of the known terms occurring in the input string.
 *
 * <p>The string is cleaned up, stripped of stop words and lower-cased, then scanned with
 * token windows of decreasing width, starting at {@code windowSize} (or the token count,
 * if smaller) and ending at single tokens. Every window whose space-joined text is a key
 * of {@code translationMap} contributes its mapped code, and the matched text is removed
 * from the working string before the next, narrower pass.
 *
 * @param s1             the raw input string
 * @param translationMap maps a lower-case term (one or more space-separated tokens) to its code
 * @param windowSize     widest window, in tokens, to try; values below 1 yield an empty list
 * @return the codes found, in discovery order (may contain duplicates)
 */
public List<String> getCodes(String s1, Map<String, String> translationMap, int windowSize){

    // Lower-case the working string itself, not just the first token list, so that the
    // candidate windows, the removals and the later re-tokenisations all see the same text.
    String s = filterAllStopWords(cleanup(s1)).toLowerCase();

    List<String> tokens = Arrays.asList(s.split(" "));
    List<String> codes = new ArrayList<>();

    // "length > 0" (rather than "!= 0") also terminates cleanly for a negative windowSize.
    for (int length = Math.min(windowSize, tokens.size()); length > 0; length--) {

        for (int i = 0; i <= tokens.size() - length; i++) {
            String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
            if (translationMap.containsKey(candidate)) {
                codes.add(translationMap.get(candidate));
                // drop the matched term so narrower windows cannot match parts of it again
                s = s.replace(candidate, "");
            }
        }

        // re-tokenise what is left before trying the next, narrower window
        tokens = Arrays.asList(s.split(" "));
    }

    return codes;
}
}

View File

@ -52,9 +52,11 @@ public class PidMatch extends AbstractCondition {
return new ConditionEval(cond, a, b, result);
}
/**
 * Converts a list of pids into a set of comparable strings.
 *
 * <p>Each entry is the pid type followed by the pid value normalized through
 * {@code normalizePid}.
 *
 * @param pbl the pids to convert
 * @return a new {@code HashSet} with one {@code type + normalizedValue} string per distinct pid
 */
private Set<String> toHashSet(List<Pid> pbl) {
    return pbl.stream()
        // single mapping stage: the earlier, non-normalizing stage produced a String, on which
        // a second getType()/getValue() mapping cannot be applied
        .map(pid -> pid.getType() + normalizePid(pid.getValue()))
        .collect(Collectors.toCollection(HashSet::new));
}

View File

@ -43,8 +43,11 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
String ca = cleanup(a);
String cb = cleanup(b);
ca = removeStopwords(ca);
cb = removeStopwords(cb);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
//replace keywords with codes
String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 4).intValue());
@ -80,16 +83,4 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
return d;
}
/**
 * Normalizes the input and strips the stop words of every configured list from it.
 *
 * <p>The lists are applied one after the other, in this order: {@code stopwordsIt},
 * {@code stopwordsEn}, {@code stopwordsDe}, {@code stopwordsFr}, {@code stopwordsPt},
 * {@code stopwordsEs}.
 *
 * @param s the string to filter
 * @return the normalized string after all six filters have been applied
 */
public String removeStopwords(String s) {
    String filtered = filterStopWords(normalize(s), stopwordsIt);
    filtered = filterStopWords(filtered, stopwordsEn);
    filtered = filterStopWords(filtered, stopwordsDe);
    filtered = filterStopWords(filtered, stopwordsFr);
    filtered = filterStopWords(filtered, stopwordsPt);
    return filterStopWords(filtered, stopwordsEs);
}
}

View File

@ -7915,7 +7915,7 @@ city::5134086;Rochester;Ga-sko-sa-ga;Gaskosago;ROC;Rocestera;Rocesteris;Rocestro
city::5136454;Schenectady;SCH;Schenectady;Shinetadi;Skanehtati;Skanéhtati;Skenektadi;Skenektadis;Skunektadi;seukinegteodi;si ke nei ke ta di;sknktdy nywywrk;sknyktady;sqnqtdy;sukenekutadi;Скенектади;Скенектаді;Скънектади;Шинетади;סקנקטדי;سكنيكتادي;سکنکتدی، نیویورک;سکینکٹڈی ، نیویارک;سکینیکٹیڈی، نیو یارک;スケネクタディ;斯克内克塔迪;스키넥터디;
city::5137849;Sheepshead Bay;;
city::5139568;Staten Island;Borough of Staten Island;Staten Island;
city::5140405;Syracuse;Bogardus Corners;Cossitts Corners;Gorad Sirakjus;Kah-ya-hoo-neh;Ken-tue-ho-ne;Milan;Na-ta-dunk;SYR;Siracusa;Sirak'jus;Sirakjus;Sirakjuz;Sirakuso;Sirakuz;South Salina;Sy-kuse;Syracusae;Syracuse;Syracuse i New York;Tu-na-ten-tonk;sayrakywz;shirakyusu;shirakyuzu;sileokyuseu;sirekyuja;syrakywz;syrakywz nyw yark;syrakywz nywywrk;syrqywz;xi la qiu ci;Горад Сіракюс;Сиракуз;Сиракьюс;Сиракюз;Сиракјус;Сіракюс;סירקיוז;سائراکیوز;سيراكيوز;سیراکیوز، نیو یارک;سیراکیوز، نیویورک;सिरॅक्युज;სირაკიუსი;シラキュース;シラキューズ;锡拉丘兹;시러큐스;
city::5140405;Syracuse;Bogardus Corners;Cossitts Corners;Gorad Sirakjus;Kah-ya-hoo-neh;Ken-tue-ho-ne;Na-ta-dunk;SYR;Siracusa;Sirak'jus;Sirakjus;Sirakjuz;Sirakuso;Sirakuz;South Salina;Sy-kuse;Syracusae;Syracuse;Syracuse i New York;Tu-na-ten-tonk;sayrakywz;shirakyusu;shirakyuzu;sileokyuseu;sirekyuja;syrakywz;syrakywz nyw yark;syrakywz nywywrk;syrqywz;xi la qiu ci;Горад Сіракюс;Сиракуз;Сиракьюс;Сиракюз;Сиракјус;Сіракюс;סירקיוז;سائراکیوز;سيراكيوز;سیراکیوز، نیو یارک;سیراکیوز، نیویورک;सिरॅक्युज;სირაკიუსი;シラキュース;シラキューズ;锡拉丘兹;시러큐스;
city::5142056;Utica;Fort Schuyler;Gorad Jutyka;Iotekha;Iotékha;Jutika;Nundadasis;Old Fort Schuyler;Tevadahahtodague;Twa-dah-ah-lo-dah-que;UCA;Unungagages;Utica;Utika;Yanundadasis;ZUA;awtyka;you ti ka;yutika;ywtyka nywywrk;ywtyqh;Јутика;Горад Ютыка;Утика;Ютика;Ютіка;יוטיקה;أوتيكا;یوتیکا، نیویورک;یوٹیکا، نیو یارک;ユーティカ;由提卡;
city::5143056;Wakefield;;
city::5143307;Washington Heights;Harlem Heights;Pen-a-bick;Washington Heights;washintonhaitsu;wosingteonhaicheu;ワシントンハイツ;워싱턴하이츠;

Can't render this file because it is too large.

View File

@ -332,7 +332,6 @@ male
malgrado
malissimo
mancanza
marche
me
medesimo
mediante

View File

@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί

1 key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
38 key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
39 key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
40 key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
41 key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university
42 key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
43 key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
44 key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί

View File

@ -117,4 +117,19 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(cf.apply(Lists.newArrayList(person(s))));
}
/**
 * Prints the keys produced by {@link KeywordsClustering} for an English and an Italian
 * organization name. Output is only printed, nothing is asserted.
 */
@Test
public void testKeywordsClustering() {
    final ClusteringFunction cf = new KeywordsClustering(params);

    final String[] names = {"Polytechnic University of Turin", "POLITECNICO DI TORINO"};
    for (final String name : names) {
        System.out.println(name);
        System.out.println(cf.apply(Lists.newArrayList(title(name))));
    }
}
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.pace.common;
import org.junit.Test;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;
/**
 * Tests for helper methods inherited from {@link AbstractPaceFunctions}.
 */
public class PaceFunctionTest extends AbstractPaceFunctions {

    /**
     * {@code normalizePid} must lower-case the identifier and drop a leading
     * {@code http(s)://dx.doi.org/} resolver prefix.
     */
    @Test
    public void normalizePidTest(){
        // each row: { expected, input }
        final String[][] cases = {
            {"identifier", "IdentifIer"},
            {"10.1109/tns.2015.2493347", "10.1109/TNS.2015.2493347"},
            {"10.0001/testdoi", "http://dx.doi.org/10.0001/testDOI"},
            {"10.0001/testdoi", "https://dx.doi.org/10.0001/testDOI"}
        };

        for (final String[] c : cases) {
            assertEquals(c[0], normalizePid(c[1]));
        }
    }

    /**
     * {@code filterAllStopWords} must remove "delle" and keep the remaining words.
     */
    @Test
    public void filterAllStopwordsTest(){
        final String filtered = filterAllStopWords("universita politecnica delle marche");
        assertEquals("universita politecnica marche", filtered);
    }
}

View File

@ -0,0 +1,7 @@
package eu.dnetlib.pace.condition;
import eu.dnetlib.pace.AbstractPaceTest;
/**
 * Test class for the condition package; it extends {@link AbstractPaceTest} and
 * currently declares no test cases.
 */
public class ConditionTest extends AbstractPaceTest {
}

View File

@ -104,4 +104,14 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
}
/**
 * The English and Italian spellings used here are expected to score above 0.9
 * with {@link JaroWinklerNormalizedName}.
 */
@Test
public void testJaroWinklerNormalizedName7() {
    final JaroWinklerNormalizedName algo = new JaroWinklerNormalizedName(params);

    final double score = algo.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO");
    System.out.println("result = " + score);

    assertTrue(score > 0.9);
}
}