2023-07-06 10:28:53 +02:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
package eu.dnetlib.pace.clustering;
|
|
|
|
|
|
|
|
import java.util.Set;
|
|
|
|
|
2020-02-10 12:38:40 +01:00
|
|
|
import org.apache.commons.lang3.StringUtils;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
|
|
|
|
|
|
|
public class NGramUtils extends AbstractPaceFunctions {
|
2023-07-18 11:38:56 +02:00
|
|
|
static private final NGramUtils NGRAMUTILS = new NGramUtils();
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
private static final int SIZE = 100;
|
|
|
|
|
2023-07-18 11:38:56 +02:00
|
|
|
private static final Set<String> stopwords = AbstractPaceFunctions
|
2023-07-06 10:28:53 +02:00
|
|
|
.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
public static String cleanupForOrdering(String s) {
|
2023-07-18 11:38:56 +02:00
|
|
|
return (NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE))
|
2023-07-06 10:28:53 +02:00
|
|
|
.substring(0, SIZE)
|
|
|
|
.replaceAll(" ", "");
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|