Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
4 changed files with 7 additions and 10 deletions
Showing only changes of commit 5499ca17c3 - Show all commits

View File

@ -25,7 +25,7 @@ public class KeywordsClustering extends AbstractClusteringFunction {
//list of combination to return as result //list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>(); final Collection<String> combinations = new LinkedHashSet<String>();
for (String keyword: keywordsToCodes(keywords)){ for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
for (String city: citiesToCodes(cities)) { for (String city: citiesToCodes(cities)) {
combinations.add(keyword+"-"+city); combinations.add(keyword+"-"+city);
if (combinations.size()>=params.getOrDefault("max", 2)) { if (combinations.size()>=params.getOrDefault("max", 2)) {

View File

@ -17,10 +17,7 @@ import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
/** /**
* Set of common functions * Set of common functions
@ -30,7 +27,6 @@ import java.util.stream.Stream;
*/ */
public abstract class AbstractPaceFunctions { public abstract class AbstractPaceFunctions {
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
@ -243,10 +239,10 @@ public abstract class AbstractPaceFunctions {
} }
public double keywordsCompare(Set<String> s1, Set<String> s2){ public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
Set<String> k1 = keywordsToCodes(s1); Set<String> k1 = keywordsToCodes(s1, translationMap);
Set<String> k2 = keywordsToCodes(s2); Set<String> k2 = keywordsToCodes(s2, translationMap);
int longer = (k1.size()>k2.size())?k1.size():k2.size(); int longer = (k1.size()>k2.size())?k1.size():k2.size();
@ -278,7 +274,7 @@ public abstract class AbstractPaceFunctions {
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
} }
public Set<String> keywordsToCodes(Set<String> keywords) { public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
return toCodes(keywords, translationMap); return toCodes(keywords, translationMap);
} }

View File

@ -2,6 +2,7 @@ package eu.dnetlib.pace.config;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.CondDef;

View File

@ -46,7 +46,7 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
if (sameCity(cities1,cities2)) { if (sameCity(cities1,cities2)) {
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) { if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) {
ca = removeKeywords(ca, keywords1); ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1); ca = removeKeywords(ca, cities1);