addition of a fixSpecial function to address the problem with special character in organization names, addition of new terms in translation maps
This commit is contained in:
parent
85070ce3fe
commit
f0b4c4cbd4
|
@ -1,7 +1,10 @@
|
|||
package eu.dnetlib;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
@ -47,4 +50,14 @@ public class Block implements Serializable {
|
|||
public int elements(){
|
||||
return elements.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(){
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
try {
|
||||
return mapper.writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Failed to create Json: ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -89,7 +89,9 @@ public class SparkLocalTest {
|
|||
System.out.println(cc);
|
||||
});
|
||||
connectedComponents.foreach(cc -> {
|
||||
cc.getDocs().stream().forEach(d -> System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname")));
|
||||
cc.getDocs().stream().forEach(d -> {
|
||||
System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname"));
|
||||
});
|
||||
});
|
||||
//print nondeduped
|
||||
nonDeduplicated.foreach(cc -> {
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIOK"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6183d331a1920dd81b8c10620a8b3a8a"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIVEL"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::8f65fd4e764086db897cc648e9cbbaed"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Social Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::c69cffc4997b54bb2eb5ca6aebcda18b"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Health Services Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::5b72dc608480f3d5569a7bfe3cbdaf07"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"SCP"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::6b7b927a3ae25f1639a6ef27b35021b5"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Catalysis Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6af340f03c44041737859d3e1354d1fe"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek van de Gezondheidszorg"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::69ab0f5ed7da9d961355cb4eb24b8613"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek in de Katalyse"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::267cf3ce23903e0a8403653019ce8187"}
|
||||
{"dateoftransformation":"2018-11-20","originalId":["corda_______::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::755737ed505484ea374062762ef05ef6"}
|
||||
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::755737ed505484ea374062762ef05ef6"}
|
||||
{"dateoftransformation":"2018-09-13","originalId":["snsf________::The_Netherlands_Institute_of_Health_Services_Research_NIVEL"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute of Health Services Research NIVEL"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-22","type":20,"id":"20|snsf________::10653be4e9c170181486aa9782346d81"}
|
||||
{"dateoftransformation":"2018-09-13","originalId":["openaire____::088a0087-4bc6-4c38-a052-b446c3b225a7::The Netherlands Institute for Social Research"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute for Social Research"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-03-30","type":20,"id":"20|openaire____::857b30f258c43852a2cb57875ac40892"}
|
||||
{"dateoftransformation":"2019-07-22","originalId":["corda_______::999895789"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIFI"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unifi.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI FIRENZE"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::19137683d6d3cd4dda5054af05081b6f"}
|
||||
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999895789"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIFI"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unifi.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI FIRENZE"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::19137683d6d3cd4dda5054af05081b6f"}
|
||||
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Firenze_University_Press"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Firenze University Press"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Firenze University Press"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::b29ae16abb2343c6ffc152666b24ea95"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Università degli Studi di Firenze"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::115715507c87ade107909750c44fbee5"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"University of Florence"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::60e21d5264c51c62f154afa6166ba21b"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"University of Florence"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::a6d1d3c2eb368cb2ab1ff293c625d90e"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Université de florence"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::c8b8860f04bf3c755f4632395ea27375"}
|
||||
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"University of Florence"},"websiteurl":{"value":"http://www.unifi.it/"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Universität Florenz"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"}],"type":20,"id":"20|grid________::ff05feef920762cbef5de7640dcb718e"}
|
||||
{"originalId":["https://academic.microsoft.com/#/detail/45084792"],"pid":[{"qualifier":{"classid":"urn","classname":"urn","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"http://en.wikipedia.org/wiki/University_of_Florence"},{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.8404.8"},{"qualifier":{"classid":"mag_id","classname":"Microsoft Academic Graph Identifier","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"https://academic.microsoft.com/#/detail/45084792"}],"collectedfrom":[{"value":"Microsoft Academic Graph","key":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"}],"organization":{"metadata":{"websiteurl":{"value":"http://www.unifi.it/"},"legalname":{"value":"University of Florence"}}},"type":20,"id":"20|microsoft___::adecd59d8ff7f5aaedac013fa0f54ffe"}
|
||||
{"dateoftransformation":"2018-09-13","originalId":["openaire____::issn20381026::Università degli Studi di Firenze"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Firenze"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-06-02","type":20,"id":"20|openaire____::55a8725b9d9a9a67615018901270de4b"}
|
||||
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Università_degli_Studi_di_Firenze"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unifi.it/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Firenze"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::4f194641be797be5e5eb11227e962145"}
|
||||
{"dateoftransformation":"2018-09-13","originalId":["snsf________::Università_degli_Studi_di_Firenze"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Firenze"},"country":{"classid":"IT","classname":"Italy","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-23","type":20,"id":"20|snsf________::4f194641be797be5e5eb11227e962145"}
|
|
@ -1,16 +1,15 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ClusteringClass("keywordsclustering")
|
||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
||||
|
||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||
|
||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
public KeywordsClustering(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
@ -36,4 +35,18 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
|||
|
||||
return combinations;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(this::doApply)
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
}
|
|
@ -6,7 +6,6 @@ import com.google.common.collect.Iterables;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.FieldListImpl;
|
||||
|
@ -29,7 +28,6 @@ import java.util.stream.Stream;
|
|||
*/
|
||||
public abstract class AbstractPaceFunctions {
|
||||
|
||||
|
||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
|
@ -44,7 +42,10 @@ public abstract class AbstractPaceFunctions {
|
|||
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
private static final String special_from = "İə";
|
||||
private static final String special_to = "Ie";
|
||||
|
||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
|
@ -55,7 +56,8 @@ public abstract class AbstractPaceFunctions {
|
|||
}
|
||||
|
||||
protected String cleanup(final String s) {
|
||||
final String s0 = s.toLowerCase();
|
||||
final String ss = fixSpecial(s); //TODO is there something implemented to replace strange symbols with latin letters?
|
||||
final String s0 = ss.toLowerCase();
|
||||
final String s1 = fixAliases(s0);
|
||||
final String s2 = nfd(s1);
|
||||
final String s3 = s2.replaceAll("–", " ");
|
||||
|
@ -98,6 +100,16 @@ public abstract class AbstractPaceFunctions {
|
|||
return s.replaceAll("\\D", "");
|
||||
}
|
||||
|
||||
//sometimes the toLowerCase() produces error, this is meant to prevent them by replacing special character before the lowercase function
|
||||
protected static String fixSpecial(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
final int i = StringUtils.indexOf(special_from, ch);
|
||||
sb.append(i >= 0 ? special_to.charAt(i) : ch);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
|
@ -154,7 +166,7 @@ public abstract class AbstractPaceFunctions {
|
|||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
protected String filterAllStopWords(String s) {
|
||||
public String filterAllStopWords(String s) {
|
||||
|
||||
s = filterStopWords(s, stopwords_en);
|
||||
s = filterStopWords(s, stopwords_de);
|
||||
|
@ -193,12 +205,12 @@ public abstract class AbstractPaceFunctions {
|
|||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
final Map<String, String> m = new HashMap<>();
|
||||
try {
|
||||
for (final String s: IOUtils.readLines(JaroWinklerNormalizedName.class.getResourceAsStream(classpath))) {
|
||||
for (final String s: IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
|
||||
//string is like this: code;word1;word2;word3
|
||||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (String key: line){
|
||||
m.put(fixAliases(key).toLowerCase(),value);
|
||||
for (int i=1; i<line.length;i++){
|
||||
m.put(line[i].toLowerCase(),value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e){
|
||||
|
@ -287,9 +299,7 @@ public abstract class AbstractPaceFunctions {
|
|||
//get the list of codes into the input string
|
||||
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize){
|
||||
|
||||
String s = cleanup(s1);
|
||||
|
||||
s = filterAllStopWords(s);
|
||||
String s = s1;
|
||||
|
||||
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||
|
||||
|
|
|
@ -7178,7 +7178,7 @@ city::743882;Kastamonu;Castamena;Castamon;Castamoni;Castamonu;Castamuni;KFS;Kast
|
|||
city::743952;Kars;Cars;Gorad Kars;KSY;Kapc;Kars;Karsa;Karsas;Khuars;Kuars;Qars;Qers;Vanand;ka er si;kaleuseu;kar s;karasa;kars;karusu;qars;Καρς;Горад Карс;Карс;Къарс;Хъарс;Ҟарс;Կարս;קארס;قارص;قەرس;كارس;کارس;ਕਾਰਸ;การ์ส;ყარსი;カルス;卡爾斯;카르스;
|
||||
city::744562;Karabuk;Karabiukas;Karabjuk;Karabuek;Karabuk;Karabük;Karampouk;Qerebuk;ka la bi ke;ka la bi ke sheng;kalabwikeu;karabuka;karabwk;karabyukku;Καραμπούκ;Карабук;Карабюк;Карабүк;Карабӱк;قرهبوک;قرہ بوک;كارابوك;کارابوک;ਕਾਰਾਬੁਕ;ყარაბუქი;カラビュック;卡拉比克;卡拉比克省;카라뷔크;
|
||||
city::745028;Izmit;Astacus;Cocaeli;Ismid;Ismit;Isnimid;Izmid;Izmit;Kodja-Eli;Koja-Ili;Nicomedia;Nicomedie;Nicomédie;Nikomedeia;Nikomedia;izumitto;yi zi mi te;İzmit;Измит;イズミット;伊兹密特;
|
||||
city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;Istanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불;
|
||||
city::745044;Istanbul;Bizanc;Bizánc;Byzance;Byzantion;Byzantium;Byzanz;Constantinoble;Constantinopla;Constantinople;Constantinopolen;Constantinopoli;Constantinopolis;Costantinopoli;Estambul;IST;Istamboul;Istambul;Istambuł;Istampoul;İstanbul;Istanbúl;Isztambul;Konstantinapoly;Konstantinopel;Konstantinopolo;Konstantinoupole;Konstantinoupoli;Konstantinoupolis;Konstantinápoly;Kustantiniyah;Micklagard;Micklagård;Mikligardur;Mikligarður;Stamboul;Stambul;Stambula;Stambuł;Tsarigrad;Vizantija (Vizantija);Vyzantio;astnbwl;bijantium;byuzantion;byzntywn;iseutanbul;istambula;isutanburu;stin Poli [stimˈboli];yi si tan bu er;İstanbul;Βυζάντιο;Βυζαντιο;Ισταμπουλ;Ισταμπούλ;Κωνσταντινουπολη;Κωνσταντινούπολη;Κωνσταντινούπολις;στην Πόλι [stimˈboli];Византија (Vizantija);Истанбул;Стамбул;ביזנטיון;اسطنبول;इस्तांबुल;イスタンブール;ビュザンティオン;伊斯坦布尔;비잔티움;이스탄불;
|
||||
city::745169;Inegol;Inegeul;Inegoel;Inegol;İnegöl;
|
||||
city::746666;Goelcuek;Geulzuk;Goelcuek;Gölcük;
|
||||
city::746881;Giresun;Cerasus;Choerades;Gireson;Giresun;Giresunas;Kerasounta;Kerassunde;Kerasun;Kerasunda;Kerasunt;Kiresun;OGU;Pharnacia;ghyrswn;gilesun;giresun;giresuni;grysn;gryswn;ji lei song;Κερασούντα;Гиресун;Ґіресун;Կերասուն;غيرسون;گرهسون;گریسن;گریسون;გირესუნი;ギレスン;吉雷松;기레순;
|
||||
|
|
Can't render this file because it is too large.
|
|
@ -445,7 +445,6 @@ posto
|
|||
potrebbe
|
||||
preferibilmente
|
||||
presa
|
||||
press
|
||||
prima
|
||||
primo
|
||||
principalmente
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
|
||||
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
|
||||
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
|
||||
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza;
|
||||
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
|
||||
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
|
||||
key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο
|
||||
key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas
|
||||
|
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
|
|||
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
|
||||
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
|
||||
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
|
||||
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology;
|
||||
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology
|
||||
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
|
||||
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
|
||||
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
|
||||
|
@ -46,8 +46,8 @@ key::45;institution;istituzione;institution;институциональный;i
|
|||
key::46;division;divisione;division;отделение;divisie;τμήμα
|
||||
key::47;committee;comitato;comité;комитет;commissie;επιτροπή
|
||||
key::48;promotion;promozione;продвижение;proothisis;forderung
|
||||
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline;
|
||||
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;
|
||||
key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline
|
||||
key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline;technologii;technical;texniki;teknik
|
||||
key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik;
|
||||
key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri;
|
||||
key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus;
|
||||
|
@ -100,4 +100,5 @@ key::99;neurology;neurologia;neurologiche;neurología;neurologia;Neurologie;neur
|
|||
key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia;
|
||||
key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia;
|
||||
key::102;informatics;informatica;informática;informática;informatica;
|
||||
key:103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association
|
||||
key:103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association
|
||||
key:104;commerce;ticaret;ticarət;commercio;trade;handel;comercio;
|
|
|
@ -141,6 +141,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
System.out.println("s4 = " + s4);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s4))));
|
||||
|
||||
final String s5 = "İstanbul Ticarət Universiteti";
|
||||
System.out.println("s5 = " + s5);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s5))));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -130,4 +130,22 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
System.out.println("result = " + result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName9() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti");
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName10(){
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence");
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue