forked from D-Net/dnet-hadoop
minor changes
This commit is contained in:
parent
50b7a12b3f
commit
5499ca17c3
|
@ -25,7 +25,7 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
//list of combination to return as result
|
//list of combination to return as result
|
||||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||||
|
|
||||||
for (String keyword: keywordsToCodes(keywords)){
|
for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
|
||||||
for (String city: citiesToCodes(cities)) {
|
for (String city: citiesToCodes(cities)) {
|
||||||
combinations.add(keyword+"-"+city);
|
combinations.add(keyword+"-"+city);
|
||||||
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
||||||
|
|
|
@ -17,10 +17,7 @@ import java.io.IOException;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set of common functions
|
* Set of common functions
|
||||||
|
@ -30,7 +27,6 @@ import java.util.stream.Stream;
|
||||||
*/
|
*/
|
||||||
public abstract class AbstractPaceFunctions {
|
public abstract class AbstractPaceFunctions {
|
||||||
|
|
||||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
|
||||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||||
|
|
||||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||||
|
@ -243,10 +239,10 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public double keywordsCompare(Set<String> s1, Set<String> s2){
|
public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
|
||||||
|
|
||||||
Set<String> k1 = keywordsToCodes(s1);
|
Set<String> k1 = keywordsToCodes(s1, translationMap);
|
||||||
Set<String> k2 = keywordsToCodes(s2);
|
Set<String> k2 = keywordsToCodes(s2, translationMap);
|
||||||
|
|
||||||
int longer = (k1.size()>k2.size())?k1.size():k2.size();
|
int longer = (k1.size()>k2.size())?k1.size():k2.size();
|
||||||
|
|
||||||
|
@ -278,7 +274,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<String> keywordsToCodes(Set<String> keywords) {
|
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||||
return toCodes(keywords, translationMap);
|
return toCodes(keywords, translationMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.config;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.condition.ConditionAlgo;
|
import eu.dnetlib.pace.condition.ConditionAlgo;
|
||||||
import eu.dnetlib.pace.model.ClusteringDef;
|
import eu.dnetlib.pace.model.ClusteringDef;
|
||||||
import eu.dnetlib.pace.model.CondDef;
|
import eu.dnetlib.pace.model.CondDef;
|
||||||
|
|
|
@ -46,7 +46,7 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
if (sameCity(cities1,cities2)) {
|
if (sameCity(cities1,cities2)) {
|
||||||
|
|
||||||
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||||
|
|
||||||
ca = removeKeywords(ca, keywords1);
|
ca = removeKeywords(ca, keywords1);
|
||||||
ca = removeKeywords(ca, cities1);
|
ca = removeKeywords(ca, cities1);
|
||||||
|
|
Loading…
Reference in New Issue