dnet-hadoop/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java

21 lines
617 B
Java

package eu.dnetlib.pace.clustering;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
public class NGramUtils extends AbstractPaceFunctions {
private static final int SIZE = 100;
private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
public static String cleanupForOrdering(String s) {
NGramUtils utils = new NGramUtils();
return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", "");
}
}