2018-10-02 10:37:54 +02:00
|
|
|
package eu.dnetlib.pace.clustering;
|
|
|
|
|
|
|
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
2019-10-08 14:53:52 +02:00
|
|
|
import eu.dnetlib.pace.config.Config;
|
2018-10-02 10:37:54 +02:00
|
|
|
import eu.dnetlib.pace.model.Field;
|
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
|
|
|
2019-03-21 14:27:27 +01:00
|
|
|
import java.util.Collection;
|
|
|
|
import java.util.HashSet;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
|
|
|
|
|
|
|
|
protected Map<String, Integer> params;
|
|
|
|
|
|
|
|
public AbstractClusteringFunction(final Map<String, Integer> params) {
|
|
|
|
this.params = params;
|
|
|
|
}
|
2018-10-24 12:09:41 +02:00
|
|
|
|
2019-10-08 14:53:52 +02:00
|
|
|
protected abstract Collection<String> doApply(Config conf, String s);
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
@Override
|
2019-10-08 14:53:52 +02:00
|
|
|
public Collection<String> apply(Config conf, List<Field> fields) {
|
2018-10-02 10:37:54 +02:00
|
|
|
return fields.stream().filter(f -> !f.isEmpty())
|
|
|
|
.map(Field::stringValue)
|
|
|
|
.map(this::normalize)
|
2019-03-21 14:27:27 +01:00
|
|
|
.map(s -> filterAllStopWords(s))
|
2019-10-08 14:53:52 +02:00
|
|
|
.map(s -> doApply(conf, s))
|
2018-10-02 10:37:54 +02:00
|
|
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
|
|
|
.flatMap(c -> c.stream())
|
|
|
|
.filter(StringUtils::isNotBlank)
|
|
|
|
.collect(Collectors.toCollection(HashSet::new));
|
|
|
|
}
|
|
|
|
|
|
|
|
public Map<String, Integer> getParams() {
|
|
|
|
return params;
|
|
|
|
}
|
|
|
|
|
|
|
|
protected Integer param(String name) {
|
|
|
|
return params.get(name);
|
|
|
|
}
|
|
|
|
}
|