Implemented a new clustering function (WordsSuffixPrefix)

This commit is contained in:
miconis 2020-07-02 17:04:17 +02:00
parent 411d1cc24f
commit f933fd33e0
2 changed files with 55 additions and 0 deletions

View File

@ -0,0 +1,42 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
/**
 * Clustering function that builds keys from the characters surrounding each
 * space in the input: up to {@code len} characters before the space joined
 * with up to {@code len} characters after it, prefixed with the number of
 * space-separated tokens of the whole input.
 *
 * <p>The window size and the maximum number of keys are read from the
 * {@code "len"} and {@code "max"} entries via {@code param(...)}.
 */
@ClusteringClass("wordssuffixprefix")
public class WordsSuffixPrefix extends AbstractClusteringFunction {

    /**
     * Minimum length of the extracted window (spaces removed, word-count
     * prefix not yet added) for it to be emitted as a key.
     */
    private static final int MIN_KEY_LENGTH = 4;

    /**
     * @param params function parameters; {@code "len"} and {@code "max"} are
     *               looked up when the function is applied
     */
    public WordsSuffixPrefix(Map<String, Integer> params) {
        super(params);
    }

    /**
     * Generates the suffix/prefix keys for a single value.
     *
     * @param conf not used by this function
     * @param s    the value to generate keys for
     * @return the generated keys, in order of appearance, without duplicates
     */
    @Override
    protected Collection<String> doApply(Config conf, String s) {
        return suffixPrefix(s, param("len"), param("max"));
    }

    /**
     * Walks the spaces of {@code s} from left to right and, for each one,
     * takes the window {@code [space - len, space + len + 1)} (clamped to the
     * end of the string), strips the spaces from it and emits
     * {@code <token count><window>}.
     *
     * <p>Example: for {@code "Search for the Standard Model Higgs Boson"},
     * {@code len = 3}, {@code max = 4} the result is
     * {@code [7rchfor, 7forthe, 7theSta, 7ardMod]}.
     *
     * @param s   the string to scan
     * @param len number of characters taken on each side of a space
     * @param max generation stops as soon as this many distinct keys exist
     * @return insertion-ordered set of keys (at most {@code max} elements)
     */
    private Collection<String> suffixPrefix(String s, int len, int max) {
        // Token count as produced by split(" "): consecutive spaces yield
        // empty tokens that are counted too (trailing empty tokens are dropped).
        final int words = s.split(" ").length;
        final Set<String> keys = Sets.newLinkedHashSet();

        // Scanning starts at index 1, so a space at index 0 is never considered.
        int from = 1;
        while (from < s.length() && keys.size() < max) {
            final int space = s.indexOf(' ', from);
            if (space < 0) {
                // No further spaces: nothing left to extract.
                break;
            }

            // NOTE(review): the strict '>' skips a window that would start at
            // index 0 (e.g. "abc def" with len = 3). Kept as-is so that the
            // emitted keys stay unchanged; confirm whether '>=' was intended.
            if (space - len > 0) {
                final int end = Math.min(space + len + 1, s.length());
                final String key = s.substring(space - len, end).replace(" ", "").trim();
                if (key.length() >= MIN_KEY_LENGTH) {
                    keys.add(words + key);
                }
            }

            // Jump past the space just handled instead of advancing one
            // character at a time and re-processing the same space.
            from = space + 1;
        }
        return keys;
    }
}

View File

@ -98,6 +98,19 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
}
/**
 * Smoke test for {@link WordsSuffixPrefix}: configures {@code len = 3} and
 * {@code max = 4}, applies the function to a sample title and prints both the
 * input and the generated keys. It makes no assertions on the output.
 */
@Test
public void testWordsSuffixPrefix() {
    final String sampleTitle = "Search for the Standard Model Higgs Boson";

    params.put("len", 3);
    params.put("max", 4);
    final ClusteringFunction clustering = new WordsSuffixPrefix(params);

    System.out.println(sampleTitle);
    System.out.println(clustering.apply(conf, Lists.newArrayList(title(sampleTitle))));
}
@Test
public void testFieldValue() {