dnet-dedup/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java

39 lines
1.3 KiB
Java

package eu.dnetlib.pace.clustering;
import com.google.common.base.Joiner;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import java.util.*;
/**
 * Clustering function that builds keys by pairing every keyword code with every city code
 * extracted from the input string.
 *
 * <p>Keyword codes are obtained by calling {@code getCodes} with the translation map, city codes
 * by calling it with the city map; both maps are loaded once from the classpath. Each produced
 * key has the form {@code <keywordCode>-<cityCode>}.
 *
 * <p>Parameters read from {@code params}:
 * <ul>
 *   <li>{@code windowSize} (default 4): forwarded to {@code getCodes} for both lookups;</li>
 *   <li>{@code max} (default 2): upper bound on the number of distinct keys returned.</li>
 * </ul>
 *
 * <p>NOTE(review): {@code getCodes} and {@code params} are not declared in this class; presumably
 * they are inherited from {@link AbstractClusteringFunction} - confirm there for their exact
 * semantics (including how a {@code null} input is treated).
 */
@ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction {

    /** Lookup table passed to {@code getCodes} to obtain keyword codes. */
    private static final Map<String, String> translationMap =
            AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");

    /** Lookup table passed to {@code getCodes} to obtain city codes. */
    private static final Map<String, String> cityMap =
            AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");

    /**
     * Creates the clustering function.
     *
     * @param params integer parameters, forwarded unchanged to the superclass constructor
     */
    public KeywordsClustering(Map<String, Integer> params) {
        super(params);
    }

    /**
     * Produces the {@code keyword-city} keys for the given string.
     *
     * <p>Keys are generated keyword-major (all cities for the first keyword, then the second
     * keyword, and so on) and returned in that insertion order without duplicates. Generation
     * stops as soon as {@code max} distinct keys have been collected. Because the limit is checked
     * after each insertion, a non-positive {@code max} still yields one key when both code lists
     * are non-empty.
     *
     * @param s the string to extract keyword and city codes from
     * @return the distinct keys in generation order; empty when no keyword code or no city code
     *         is found
     */
    @Override
    protected Collection<String> doApply(String s) {
        // Both settings are loop-invariant, so read them once up front.
        final int windowSize = params.getOrDefault("windowSize", 4);
        final int max = params.getOrDefault("max", 2);

        final List<String> keywords = getCodes(s, translationMap, windowSize);
        final List<String> cities = getCodes(s, cityMap, windowSize);

        final Collection<String> combinations = new LinkedHashSet<>();
        for (String keyword : keywords) {
            for (String city : cities) {
                combinations.add(keyword + "-" + city);
                // Compare the number of DISTINCT keys against the cap. The previous
                // "++size > max" test let max + 1 keys through and also counted
                // duplicate pairs that the set had silently dropped.
                if (combinations.size() >= max) {
                    return combinations;
                }
            }
        }
        return combinations;
    }
}