2023-07-06 10:28:53 +02:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
package eu.dnetlib.pace.clustering;
|
|
|
|
|
|
|
|
import java.util.Collection;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Set;
|
|
|
|
import java.util.StringTokenizer;
|
|
|
|
|
|
|
|
import com.google.common.collect.Sets;
|
2023-07-06 10:28:53 +02:00
|
|
|
|
2019-10-08 14:53:52 +02:00
|
|
|
import eu.dnetlib.pace.config.Config;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
2018-10-24 12:09:41 +02:00
|
|
|
@ClusteringClass("acronyms")
|
2018-10-02 10:37:54 +02:00
|
|
|
public class Acronyms extends AbstractClusteringFunction {
|
|
|
|
|
|
|
|
public Acronyms(Map<String, Integer> params) {
|
|
|
|
super(params);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
2019-10-08 14:53:52 +02:00
|
|
|
protected Collection<String> doApply(Config conf, String s) {
|
2018-10-02 10:37:54 +02:00
|
|
|
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
|
|
|
}
|
2023-07-06 10:28:53 +02:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
|
2023-07-06 10:28:53 +02:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
final Set<String> acronyms = Sets.newLinkedHashSet();
|
2023-07-06 10:28:53 +02:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
for (int i = 0; i < maxAcronyms; i++) {
|
2023-07-06 10:28:53 +02:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
final StringTokenizer st = new StringTokenizer(s);
|
|
|
|
final StringBuilder sb = new StringBuilder();
|
2023-07-06 10:28:53 +02:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
while (st.hasMoreTokens()) {
|
|
|
|
final String token = st.nextToken();
|
|
|
|
if (sb.length() > maxLen) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (token.length() > 1 && i < token.length()) {
|
|
|
|
sb.append(token.charAt(i));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
String acronym = sb.toString();
|
|
|
|
if (acronym.length() > minLen) {
|
|
|
|
acronyms.add(acronym);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return acronyms;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|