2018-10-02 10:37:54 +02:00
|
|
|
package eu.dnetlib.pace.clustering;
|
|
|
|
|
|
|
|
import java.util.Collection;
|
2018-10-25 15:15:40 +02:00
|
|
|
import java.util.HashMap;
|
2018-10-02 10:37:54 +02:00
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
|
|
|
|
import com.google.common.collect.Lists;
|
2019-10-08 14:53:52 +02:00
|
|
|
import eu.dnetlib.pace.config.Config;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
2018-10-24 12:09:41 +02:00
|
|
|
@ClusteringClass("ngrampairs")
|
2018-10-02 10:37:54 +02:00
|
|
|
public class NgramPairs extends Ngrams {
|
|
|
|
|
|
|
|
public NgramPairs(Map<String, Integer> params) {
|
|
|
|
super(params);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
2019-10-08 14:53:52 +02:00
|
|
|
protected Collection<String> doApply(Config conf, String s) {
|
2018-10-02 10:37:54 +02:00
|
|
|
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
|
|
|
|
}
|
|
|
|
|
|
|
|
protected Collection<String> ngramPairs(final List<String> ngrams, int maxNgrams) {
|
|
|
|
Collection<String> res = Lists.newArrayList();
|
|
|
|
int j = 0;
|
|
|
|
for (int i = 0; i < ngrams.size() && res.size() < maxNgrams; i++) {
|
|
|
|
if (++j >= ngrams.size()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
res.add(ngrams.get(i) + ngrams.get(j));
|
|
|
|
//System.out.println("-- " + concatNgrams);
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|