From f933fd33e081745259d12ab2b89d181483f25462 Mon Sep 17 00:00:00 2001
From: miconis
Date: Thu, 2 Jul 2020 17:04:17 +0200
Subject: [PATCH] implemented new function for clustering

---
 .../pace/clustering/WordsSuffixPrefix.java    | 68 +++++++++++++++++++
 .../clustering/ClusteringFunctionTest.java    | 13 ++++
 2 files changed, 81 insertions(+)
 create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java

diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
new file mode 100644
index 000000000..6086ac0a8
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
@@ -0,0 +1,68 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Builds clustering keys from the characters on either side of each space in a string.
+ *
+ * <p>For every space, the {@code len} characters before it and up to {@code len} characters
+ * after it are concatenated with spaces removed, and prefixed with the number of
+ * space-separated tokens in the whole string. For example, with {@code len = 3} the string
+ * {@code "Search for the Standard Model"} produces {@code 5rchfor}, {@code 5forthe}, ...
+ */
+@ClusteringClass("wordssuffixprefix")
+public class WordsSuffixPrefix extends AbstractClusteringFunction {
+
+    /** Fragments shorter than this, once spaces are removed, are discarded. */
+    private static final int MIN_FRAGMENT_LENGTH = 4;
+
+    public WordsSuffixPrefix(Map params) {
+        super(params);
+    }
+
+    @Override
+    protected Collection doApply(Config conf, String s) {
+        return suffixPrefix(s, param("len"), param("max"));
+    }
+
+    /**
+     * Collects at most {@code max} distinct keys, visiting the spaces of {@code s} left to right.
+     *
+     * @param s   the string to take fragments from
+     * @param len number of characters taken on each side of a space
+     * @param max maximum number of keys to return
+     * @return the keys, in order of first occurrence
+     */
+    private Collection<String> suffixPrefix(String s, int len, int max) {
+        final int words = s.split(" ").length;
+        final Set<String> keys = Sets.newLinkedHashSet();
+
+        // Index 0 is never searched, so a leading space is not treated as a boundary.
+        int from = 1;
+        while (from < s.length() && keys.size() < max) {
+            final int space = s.indexOf(' ', from);
+            if (space < 0) {
+                break; // no further spaces
+            }
+            // Continue after this space; stepping one character at a time would find the
+            // same space again and rebuild the same fragment for every character of a word.
+            from = space + 1;
+
+            // Skip boundaries that have len or fewer characters before them.
+            if (space - len > 0) {
+                final int end = Math.min(space + len + 1, s.length());
+                final String fragment = s.substring(space - len, end).replace(" ", "").trim();
+                if (fragment.length() >= MIN_FRAGMENT_LENGTH) {
+                    keys.add(words + fragment);
+                }
+            }
+        }
+        return keys;
+    }
+
+}
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
index 2bccdd5cb..832ba4bce 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@@ -98,6 +98,19 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
 	}
 
+	@Test
+	public void testWordsSuffixPrefix() {
+
+		params.put("len", 3);
+		params.put("max", 4);
+
+		final ClusteringFunction sp = new WordsSuffixPrefix(params);
+
+		final String s = "Search for the Standard Model Higgs Boson";
+		System.out.println(s);
+		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+	}
+
 	@Test
 	public void testFieldValue() {
 