From 07ab904d609379c93598b2b16f501964decc16dd Mon Sep 17 00:00:00 2001
From: miconis
Date: Thu, 16 Jul 2020 18:57:55 +0200
Subject: [PATCH] implementation of the clustering function for the suffixprefix chain

---
Notes (not part of the commit message):
  Adds the "wordsStatsSuffixPrefixChain" clustering function. For an input
  string it keeps the space-separated words longer than 3 characters and
  emits up to two keys shaped as WORDS-LETTERS/MOD-CHAIN, where WORDS is the
  number of kept words, LETTERS is the length of the whole input string, MOD
  is the "mod" parameter and CHAIN alternates the 3-character suffix/prefix
  of the first two (or, when available, three) kept words. Inputs with fewer
  than two kept words produce no key.
  The new test only prints the generated keys; it asserts nothing.
  Whitespace of this patch was reconstructed: if the test-file context does
  not match, apply with `git apply --ignore-whitespace`.

 .../WordsStatsSuffixPrefixChain.java          | 90 +++++++++++++++++++
 .../clustering/ClusteringFunctionTest.java    | 24 +++++
 2 files changed, 114 insertions(+)
 create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java

diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java
new file mode 100644
index 000000000..6fa2668fa
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java
@@ -0,0 +1,90 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+@ClusteringClass("wordsStatsSuffixPrefixChain")
+public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
+
+	public WordsStatsSuffixPrefixChain(Map<String, Integer> params) {
+		super(params);
+	}
+
+	@Override
+	protected Collection<String> doApply(Config conf, String s) {
+		return suffixPrefixChain(s, param("mod"));
+	}
+
+	private Collection<String> suffixPrefixChain(String s, int mod) {
+
+		//create the list of words from the string (remove short words)
+		List<String> wordsList =
+				Arrays.stream(s.split(" "))
+						.filter(si -> si.length() > 3)
+						.collect(Collectors.toList());
+
+		final int words = wordsList.size();
+		final int letters = s.length();
+
+		//create the prefix: number of words + number of letters/mod
+		String prefix = words + "-" + letters/mod + "-";
+
+		return doSuffixPrefixChain(wordsList, prefix);
+
+	}
+
+	private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
+
+		Set<String> set = Sets.newLinkedHashSet();
+		switch(wordsList.size()){
+			case 0:
+			case 1:
+				break;
+			case 2:
+				set.add(
+						prefix +
+						suffix(wordsList.get(0), 3) +
+						prefix(wordsList.get(1), 3)
+				);
+
+				set.add(
+						prefix +
+						prefix(wordsList.get(0), 3) +
+						suffix(wordsList.get(1), 3)
+				);
+
+				break;
+			default:
+				set.add(
+						prefix +
+						suffix(wordsList.get(0), 3) +
+						prefix(wordsList.get(1), 3) +
+						suffix(wordsList.get(2), 3)
+				);
+
+				set.add(
+						prefix +
+						prefix(wordsList.get(0), 3) +
+						suffix(wordsList.get(1), 3) +
+						prefix(wordsList.get(2), 3)
+				);
+				break;
+		}
+
+		return set;
+
+	}
+
+
+	private String suffix(String s, int len) {
+		return s.substring(s.length()-len);
+	}
+
+	private String prefix(String s, int len) {
+		return s.substring(0, len);
+	}
+
+}
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
index 832ba4bce..396604cea 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@@ -111,6 +111,30 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
 	}
 
+	@Test
+	public void testWordsStatsSuffixPrefix() {
+		params.put("mod", 10);
+
+		final ClusteringFunction sp = new WordsStatsSuffixPrefixChain(params);
+
+		String s = "Search for the Standard Model Higgs Boson";
+		System.out.println(s);
+		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+
+		s = "A Physical Education Teacher Is Like...: Examining Turkish Students Perceptions of Physical Education Teachers Through Metaphor Analysis";
+		System.out.println(s);
+		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+
+		s = "Structure of a Eukaryotic Nonribosomal Peptide Synthetase Adenylation Domain That Activates a Large Hydroxamate Amino Acid in Siderophore Biosynthesis";
+		System.out.println(s);
+		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+
+		s = "Performance Evaluation";
+		System.out.println(s);
+		System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+
+	}
+
 	@Test
 	public void testFieldValue() {
 