diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index 1680ab0..769ecf5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -25,7 +25,7 @@ public class KeywordsClustering extends AbstractClusteringFunction { //list of combination to return as result final Collection combinations = new LinkedHashSet(); - for (String keyword: keywordsToCodes(keywords)){ + for (String keyword: keywordsToCodes(keywords, conf.translationMap())){ for (String city: citiesToCodes(cities)) { combinations.add(keyword+"-"+city); if (combinations.size()>=params.getOrDefault("max", 2)) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 3050293..23ff7ac 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -17,10 +17,7 @@ import java.io.IOException; import java.io.StringWriter; import java.text.Normalizer; import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; -import java.util.stream.Stream; /** * Set of common functions @@ -30,7 +27,6 @@ import java.util.stream.Stream; */ public abstract class AbstractPaceFunctions { - private static Map translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv"); private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); @@ -243,10 +239,10 @@ public abstract class AbstractPaceFunctions { } - public double keywordsCompare(Set s1, Set s2){ + public double keywordsCompare(Set s1, Set s2, Map translationMap){ - Set k1 = keywordsToCodes(s1); - Set k2 = keywordsToCodes(s2); + Set k1 = keywordsToCodes(s1, translationMap); + Set k2 = keywordsToCodes(s2, translationMap); int longer = (k1.size()>k2.size())?k1.size():k2.size(); @@ -278,7 +274,7 @@ public abstract class AbstractPaceFunctions { return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); } - public Set keywordsToCodes(Set keywords) { + public Set keywordsToCodes(Set keywords, Map translationMap) { return toCodes(keywords, translationMap); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 939f6c0..56995bb 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.config; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.CondDef; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java index 64ab74c..889ebab 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java @@ -46,7 +46,7 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { if (sameCity(cities1,cities2)) { - if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) { + if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) { ca = removeKeywords(ca, keywords1); ca = removeKeywords(ca, cities1);