addition of a fixSpecial function to address the problem with special characters in organization names; addition of new terms to the translation maps

This commit is contained in:
miconis 2019-08-06 17:06:05 +02:00
parent 85070ce3fe
commit f0b4c4cbd4
10 changed files with 96 additions and 36 deletions

View File

@ -1,7 +1,10 @@
package eu.dnetlib;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
@ -47,4 +50,14 @@ public class Block implements Serializable {
public int elements(){
return elements.size();
}
// Shared JSON serializer: building an ObjectMapper is comparatively expensive, so one
// instance is reused by every toString() call instead of allocating a new one each time.
// Being static, it is not part of the Java-serialized state of this Serializable class.
private static final ObjectMapper JSON_MAPPER = new ObjectMapper();

/**
 * Renders this block as a JSON string.
 *
 * @return the JSON document Jackson produces for this instance
 * @throws PaceException if Jackson fails to serialize this instance
 */
@Override
public String toString() {
    try {
        return JSON_MAPPER.writeValueAsString(this);
    } catch (IOException e) {
        // Keep the original cause so the serialization problem stays diagnosable.
        throw new PaceException("Failed to create Json: ", e);
    }
}
}

View File

@ -89,7 +89,9 @@ public class SparkLocalTest {
System.out.println(cc);
});
connectedComponents.foreach(cc -> {
cc.getDocs().stream().forEach(d -> System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname")));
cc.getDocs().stream().forEach(d -> {
System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname"));
});
});
//print nondeduped
nonDeduplicated.foreach(cc -> {

View File

@ -1,12 +1,12 @@
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIOK"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6183d331a1920dd81b8c10620a8b3a8a"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIVEL"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::8f65fd4e764086db897cc648e9cbbaed"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Social Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::c69cffc4997b54bb2eb5ca6aebcda18b"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Health Services Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::5b72dc608480f3d5569a7bfe3cbdaf07"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"SCP"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::6b7b927a3ae25f1639a6ef27b35021b5"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Catalysis Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6af340f03c44041737859d3e1354d1fe"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek van de Gezondheidszorg"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::69ab0f5ed7da9d961355cb4eb24b8613"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek in de Katalyse"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::267cf3ce23903e0a8403653019ce8187"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::755737ed505484ea374062762ef05ef6"}
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::755737ed505484ea374062762ef05ef6"}
{"dateoftransformation":"2018-09-13","originalId":["snsf________::The_Netherlands_Institute_of_Health_Services_Research_NIVEL"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute of Health Services Research NIVEL"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-22","type":20,"id":"20|snsf________::10653be4e9c170181486aa9782346d81"}
{"dateoftransformation":"2018-09-13","originalId":["openaire____::088a0087-4bc6-4c38-a052-b446c3b225a7::The Netherlands Institute for Social Research"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute for Social Research"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-03-30","type":20,"id":"20|openaire____::857b30f258c43852a2cb57875ac40892"}
{"dateoftransformation":"2019-07-22","originalId":["corda_______::999895789"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIFI"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unifi.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI FIRENZE"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::19137683d6d3cd4dda5054af05081b6f"}
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999895789"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIFI"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unifi.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI FIRENZE"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::19137683d6d3cd4dda5054af05081b6f"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Firenze_University_Press"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Firenze University Press"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Firenze University Press"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::b29ae16abb2343c6ffc152666b24ea95"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Università degli Studi di Firenze"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::115715507c87ade107909750c44fbee5"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"University of Florence"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::60e21d5264c51c62f154afa6166ba21b"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"University of Florence"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::a6d1d3c2eb368cb2ab1ff293c625d90e"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Université de florence"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::c8b8860f04bf3c755f4632395ea27375"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Universität Florenz"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::ff05feef920762cbef5de7640dcb718e"}
{"originalId":["https://academic.microsoft.com/#/detail/45084792"],"pid":[{"qualifier":{"classid":"urn","classname":"urn","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"http://en.wikipedia.org/wiki/University_of_Florence"},{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"},{"qualifier":{"classid":"mag_id","classname":"Microsoft Academic Graph Identifier","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"https://academic.microsoft.com/#/detail/45084792"}],"collectedfrom":[{"value":"Microsoft Academic Graph","key":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"}],"organization":{"metadata":{"websiteurl":{"value":"http://www.unifi.it/"},"legalname":{"value":"University of Florence"}}},"type":20,"id":"20|microsoft___::adecd59d8ff7f5aaedac013fa0f54ffe"}
{"dateoftransformation":"2018-09-13","originalId":["openaire____::issn20381026::Università degli Studi di Firenze"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Firenze"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-06-02","type":20,"id":"20|openaire____::55a8725b9d9a9a67615018901270de4b"}
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Università_degli_Studi_di_Firenze"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unifi.it/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Firenze"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::4f194641be797be5e5eb11227e962145"}
{"dateoftransformation":"2018-09-13","originalId":["snsf________::Università_degli_Studi_di_Firenze"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Firenze"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-23","type":20,"id":"20|snsf________::4f194641be797be5e5eb11227e962145"}

View File

@ -1,16 +1,15 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
@ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction {
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
public KeywordsClustering(Map<String, Integer> params) {
super(params);
}
@ -36,4 +35,18 @@ public class KeywordsClustering extends AbstractClusteringFunction {
return combinations;
}
/**
 * Computes the clustering keys for the given fields.
 *
 * Each non-empty field value is cleaned up, normalized and stripped of stop words,
 * then expanded through doApply; blacklisted entries and blank keys are discarded
 * and the surviving keys are gathered without duplicates.
 *
 * @param fields the fields to derive clustering keys from
 * @return the distinct, non-blank clustering keys
 */
@Override
public Collection<String> apply(List<Field> fields) {
    return fields.stream()
            .filter(field -> !field.isEmpty())
            // TODO: check whether the cleanup step can live in AbstractClusteringFunction
            // so that this method does not need to be overridden here.
            .map(field -> filterAllStopWords(normalize(cleanup(field.stringValue()))))
            .map(text -> filterBlacklisted(doApply(text), ngramBlacklist))
            .flatMap(Collection::stream)
            .filter(key -> StringUtils.isNotBlank(key))
            .collect(Collectors.toCollection(HashSet::new));
}
}

View File

@ -6,7 +6,6 @@ import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
@ -29,7 +28,6 @@ import java.util.stream.Stream;
*/
public abstract class AbstractPaceFunctions {
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
@ -44,7 +42,10 @@ public abstract class AbstractPaceFunctions {
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
private static final String special_from = "İə";
private static final String special_to = "Ie";
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
@ -55,7 +56,8 @@ public abstract class AbstractPaceFunctions {
}
protected String cleanup(final String s) {
final String s0 = s.toLowerCase();
final String ss = fixSpecial(s); //TODO is there something implemented to replace strange symbols with latin letters?
final String s0 = ss.toLowerCase();
final String s1 = fixAliases(s0);
final String s2 = nfd(s1);
final String s3 = s2.replaceAll("&ndash;", " ");
@ -98,6 +100,16 @@ public abstract class AbstractPaceFunctions {
return s.replaceAll("\\D", "");
}
/**
 * Replaces every character listed in special_from with the character at the same
 * position in special_to, leaving all other characters untouched.
 *
 * It is meant to be applied before toLowerCase(), which sometimes produces
 * erroneous output for these special characters.
 *
 * @param s the string to fix
 * @return a copy of s with the special characters substituted
 */
protected static String fixSpecial(final String s) {
    final StringBuilder fixed = new StringBuilder();
    for (int pos = 0; pos < s.length(); pos++) {
        final char current = s.charAt(pos);
        final int mapped = special_from.indexOf(current);
        if (mapped >= 0) {
            fixed.append(special_to.charAt(mapped));
        } else {
            fixed.append(current);
        }
    }
    return fixed.toString();
}
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
for (final char ch : Lists.charactersOf(s)) {
@ -154,7 +166,7 @@ public abstract class AbstractPaceFunctions {
return sb.toString().trim();
}
protected String filterAllStopWords(String s) {
public String filterAllStopWords(String s) {
s = filterStopWords(s, stopwords_en);
s = filterStopWords(s, stopwords_de);
@ -193,12 +205,12 @@ public abstract class AbstractPaceFunctions {
public static Map<String, String> loadMapFromClasspath(final String classpath) {
final Map<String, String> m = new HashMap<>();
try {
for (final String s: IOUtils.readLines(JaroWinklerNormalizedName.class.getResourceAsStream(classpath))) {
for (final String s: IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
//string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
for (String key: line){
m.put(fixAliases(key).toLowerCase(),value);
for (int i=1; i<line.length;i++){
m.put(line[i].toLowerCase(),value);
}
}
} catch (final Throwable e){
@ -287,9 +299,7 @@ public abstract class AbstractPaceFunctions {
//get the set of keyword codes found in the input string
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize){
String s = cleanup(s1);
s = filterAllStopWords(s);
String s = s1;
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));

View File

@ -7178,7 +7178,7 @@ city::743882;Kastamonu;Castamena;Castamon;Castamoni;Castamonu;Castamuni;KFS;Kast
city::743952;Kars;Cars;Gorad Kars;KSY;Kapc;Kars;Karsa;Karsas;Khuars;Kuars;Qars;Qers;Vanand;ka er si;kaleuseu;kar s;karasa;kars;karusu;qars;Καρς;Горад Карс;Карс;Къарс;Хъарс;Ҟарс;Կարս;קארס;قارص;قەرس;كارس;کارس;ਕਾਰਸ;การ์ส;ყარსი;カルス;卡爾斯;카르스;
city::744562;Karabuk;Karabiukas;Karabjuk;Karabuek;Karabuk;Karabük;Karampouk;Qerebuk;ka la bi ke;ka la bi ke sheng;kalabwikeu;karabuka;karabwk;karabyukku;Καραμπούκ;Карабук;Карабюк;Карабүк;Карабӱк;قره‌بوک;قرہ بوک;كارابوك;کارابوک;ਕਾਰਾਬੁਕ;ყარაბუქი;カラビュック;卡拉比克;卡拉比克省;카라뷔크;
city::745028;Izmit;Astacus;Cocaeli;Ismid;Ismit;Isnimid;Izmid;Izmit;Kodja-Eli;Koja-Ili;Nicomedia;Nicomedie;Nicomédie;Nikomedeia;Nikomedia;izumitto;yi zi mi te;İzmit;Измит;イズミット;伊兹密特;
city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;Istanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불;
city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;İstanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불;
city::745169;Inegol;Inegeul;Inegoel;Inegol;İnegöl;
city::746666;Goelcuek;Geulzuk;Goelcuek;Gölcük;
city::746881;Giresun;Cerasus;Choerades;Gireson;Giresun;Giresunas;Kerasounta;Kerassunde;Kerasun;Kerasunda;Kerasunt;Kiresun;OGU;Pharnacia;ghyrswn;gilesun;giresun;giresuni;grysn;gryswn;ji lei song;Κερασούντα;Гиресун;Ґіресун;Կերասուն;غيرسون;گره‌سون;گریسن;گریسون;გირესუნი;ギレスン;吉雷松;기레순;

Can't render this file because it is too large.

View File

@ -445,7 +445,6 @@ posto
potrebbe
preferibilmente
presa
press
prima
primo
principalmente

View File

@ -1,6 +1,6 @@
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza;
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology;
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
key::44;academic;accademico;académique;universitaire;академический;academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
@ -46,8 +46,8 @@ key::45;institution;istituzione;institution;институциональный;i
key::46;division;divisione;division;отделение;divisie;τμήμα
key::47;committee;comitato;comité;комитет;commissie;επιτροπή
key::48;promotion;promozione;продвижение;proothisis;forderung
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;technical;texniki;teknik
key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
@ -100,4 +100,5 @@ key::99;neurology;neurologia;neurologiche;neurología;neurologia;Neurologie;neur
key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia;
key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia;
key::102;informatics;informatica;informática;informática;informatica;
key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association
key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association
key::104;commerce;ticaret;ticarət;commercio;trade;handel;comercio;
1 key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
2 key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
3 key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza; key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
4 key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
5 key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
6 key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
38 key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
39 key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
40 key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
41 key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology; key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology
42 key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
43 key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
44 key::44;academic;accademico;académique;universitaire;академический;academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
46 key::46;division;divisione;division;отделение;divisie;τμήμα
47 key::47;committee;comitato;comité;комитет;commissie;επιτροπή
48 key::48;promotion;promozione;продвижение;proothisis;forderung
49 key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline; key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline
50 key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii; key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;technical;texniki;teknik
51 key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
52 key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
53 key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
100 key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia;
101 key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia;
102 key::102;informatics;informatica;informática;informática;informatica;
103 key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association
104 key::104;commerce;ticaret;ticarət;commercio;trade;handel;comercio;

View File

@ -141,6 +141,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println("s4 = " + s4);
System.out.println(cf.apply(Lists.newArrayList(title(s4))));
final String s5 = "İstanbul Ticarət Universiteti";
System.out.println("s5 = " + s5);
System.out.println(cf.apply(Lists.newArrayList(title(s5))));
}
}

View File

@ -130,4 +130,22 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
System.out.println("result = " + result);
}
/**
 * Prints the distance computed by {@code JaroWinklerNormalizedName} between an
 * English organization name and a spelling of it that contains non-ASCII
 * characters. The value is only written to standard output; nothing is asserted.
 */
@Test
public void testJaroWinklerNormalizedName9() {
    final String englishName = "Istanbul Commerce University";
    final String localizedName = "İstanbul Ticarət Universiteti";
    final double distance = new JaroWinklerNormalizedName(params).distance(englishName, localizedName);
    System.out.println("result = " + distance);
}
/**
 * Prints the distance computed by {@code JaroWinklerNormalizedName} for two
 * different organization names that both contain the word "University".
 * The value is only written to standard output; nothing is asserted.
 */
@Test
public void testJaroWinklerNormalizedName10(){
    final String firstName = "Firenze University Press";
    final String secondName = "University of Florence";
    final JaroWinklerNormalizedName comparator = new JaroWinklerNormalizedName(params);
    final double distance = comparator.distance(firstName, secondName);
    System.out.println("result = " + distance);
}
}