2018-10-02 10:37:54 +02:00
package eu.dnetlib.pace.clustering ;
import com.google.common.collect.Lists ;
import com.google.common.collect.Maps ;
import eu.dnetlib.pace.AbstractPaceTest ;
2018-11-05 17:22:59 +01:00
import eu.dnetlib.pace.common.AbstractPaceFunctions ;
2019-10-08 14:53:52 +02:00
import eu.dnetlib.pace.config.DedupConfig ;
2020-12-04 15:41:31 +01:00
import org.junit.jupiter.api.* ;
2018-10-02 10:37:54 +02:00
2019-10-08 14:53:52 +02:00
import java.util.Map ;
2018-10-02 10:37:54 +02:00
public class ClusteringFunctionTest extends AbstractPaceTest {
2020-12-04 15:41:31 +01:00
private static Map < String , Integer > params ;
private static DedupConfig conf ;
2018-10-02 10:37:54 +02:00
2020-12-04 15:41:31 +01:00
@BeforeAll
public static void setUp ( ) throws Exception {
2018-10-02 10:37:54 +02:00
params = Maps . newHashMap ( ) ;
2019-12-17 09:16:26 +01:00
conf = DedupConfig . load ( AbstractPaceFunctions . readFromClasspath ( " /eu/dnetlib/pace/config/organization.current.conf.json " , ClusteringFunctionTest . class ) ) ;
2018-10-02 10:37:54 +02:00
}
2018-11-05 17:22:59 +01:00
@Test
public void testUrlClustering ( ) {
final ClusteringFunction urlClustering = new UrlClustering ( params ) ;
final String s = " http://www.test.it/path/to/resource " ;
System . out . println ( s ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( urlClustering . apply ( conf , Lists . newArrayList ( url ( s ) ) ) ) ;
2018-11-05 17:22:59 +01:00
}
2018-10-02 10:37:54 +02:00
@Test
public void testNgram ( ) {
params . put ( " ngramLen " , 3 ) ;
params . put ( " max " , 8 ) ;
params . put ( " maxPerToken " , 2 ) ;
params . put ( " minNgramLen " , 1 ) ;
final ClusteringFunction ngram = new Ngrams ( params ) ;
final String s = " Search for the Standard Model Higgs Boson " ;
System . out . println ( s ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( ngram . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
2018-10-02 10:37:54 +02:00
}
@Test
public void testNgramPairs ( ) {
params . put ( " ngramLen " , 3 ) ;
2021-09-13 14:53:19 +02:00
params . put ( " max " , 2 ) ;
2018-10-02 10:37:54 +02:00
final ClusteringFunction np = new NgramPairs ( params ) ;
final String s = " Search for the Standard Model Higgs Boson " ;
System . out . println ( s ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( np . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
2018-10-02 10:37:54 +02:00
}
@Test
public void testSortedNgramPairs ( ) {
params . put ( " ngramLen " , 3 ) ;
2021-09-13 14:53:19 +02:00
params . put ( " max " , 2 ) ;
2018-10-02 10:37:54 +02:00
final ClusteringFunction np = new SortedNgramPairs ( params ) ;
final String s1 = " University of Pisa " ;
System . out . println ( s1 ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( np . apply ( conf , Lists . newArrayList ( title ( s1 ) ) ) ) ;
2018-10-02 10:37:54 +02:00
final String s2 = " Pisa University " ;
System . out . println ( s2 ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( np . apply ( conf , Lists . newArrayList ( title ( s2 ) ) ) ) ;
2021-09-13 14:53:19 +02:00
final String s3 = " Parco Tecnologico Agroalimentare Umbria " ;
System . out . println ( s3 ) ;
System . out . println ( np . apply ( conf , Lists . newArrayList ( title ( s3 ) ) ) ) ;
2018-10-02 10:37:54 +02:00
}
@Test
public void testAcronym ( ) {
params . put ( " max " , 4 ) ;
params . put ( " minLen " , 1 ) ;
params . put ( " maxLen " , 3 ) ;
final ClusteringFunction acro = new Acronyms ( params ) ;
final String s = " Search for the Standard Model Higgs Boson " ;
System . out . println ( s ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( acro . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
2018-10-02 10:37:54 +02:00
}
@Test
public void testSuffixPrefix ( ) {
params . put ( " len " , 3 ) ;
params . put ( " max " , 4 ) ;
final ClusteringFunction sp = new SuffixPrefix ( params ) ;
final String s = " Search for the Standard Model Higgs Boson " ;
System . out . println ( s ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( sp . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
2018-10-02 10:37:54 +02:00
}
2020-07-02 17:04:17 +02:00
@Test
public void testWordsSuffixPrefix ( ) {
params . put ( " len " , 3 ) ;
params . put ( " max " , 4 ) ;
final ClusteringFunction sp = new WordsSuffixPrefix ( params ) ;
final String s = " Search for the Standard Model Higgs Boson " ;
System . out . println ( s ) ;
System . out . println ( sp . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
}
2020-07-16 18:57:55 +02:00
@Test
public void testWordsStatsSuffixPrefix ( ) {
params . put ( " mod " , 10 ) ;
final ClusteringFunction sp = new WordsStatsSuffixPrefixChain ( params ) ;
String s = " Search for the Standard Model Higgs Boson " ;
System . out . println ( s ) ;
System . out . println ( sp . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
s = " A Physical Education Teacher Is Like...: Examining Turkish Students Perceptions of Physical Education Teachers Through Metaphor Analysis " ;
System . out . println ( s ) ;
System . out . println ( sp . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
s = " Structure of a Eukaryotic Nonribosomal Peptide Synthetase Adenylation Domain That Activates a Large Hydroxamate Amino Acid in Siderophore Biosynthesis " ;
System . out . println ( s ) ;
System . out . println ( sp . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
s = " Performance Evaluation " ;
System . out . println ( s ) ;
System . out . println ( sp . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
2021-09-13 14:53:19 +02:00
s = " JRC Open Power Plants Database (JRC-PPDB-OPEN) " ;
System . out . println ( s ) ;
System . out . println ( sp . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
s = " JRC Open Power Plants Database " ;
System . out . println ( s ) ;
System . out . println ( sp . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
2020-07-16 18:57:55 +02:00
}
2018-10-02 10:37:54 +02:00
@Test
public void testFieldValue ( ) {
params . put ( " randomLength " , 5 ) ;
final ClusteringFunction sp = new SpaceTrimmingFieldValue ( params ) ;
final String s = " Search for the Standard Model Higgs Boson " ;
System . out . println ( s ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( sp . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
2018-10-02 10:37:54 +02:00
}
2019-07-08 09:44:02 +02:00
@Test
public void testKeywordsClustering ( ) {
final ClusteringFunction cf = new KeywordsClustering ( params ) ;
final String s = " Polytechnic University of Turin " ;
System . out . println ( s ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( cf . apply ( conf , Lists . newArrayList ( title ( s ) ) ) ) ;
2019-07-08 09:44:02 +02:00
final String s1 = " POLITECNICO DI TORINO " ;
System . out . println ( s1 ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( cf . apply ( conf , Lists . newArrayList ( title ( s1 ) ) ) ) ;
2019-07-08 09:44:02 +02:00
2019-07-08 11:01:49 +02:00
final String s2 = " Universita farmaceutica culturale di milano bergamo " ;
System . out . println ( " s2 = " + s2 ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( cf . apply ( conf , Lists . newArrayList ( title ( s2 ) ) ) ) ;
2019-07-08 11:01:49 +02:00
final String s3 = " universita universita milano milano " ;
System . out . println ( " s3 = " + s3 ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( cf . apply ( conf , Lists . newArrayList ( title ( s3 ) ) ) ) ;
2019-07-08 09:44:02 +02:00
2019-07-19 17:10:29 +02:00
final String s4 = " Politechniki Warszawskiej (Warsaw University of Technology) " ;
System . out . println ( " s4 = " + s4 ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( cf . apply ( conf , Lists . newArrayList ( title ( s4 ) ) ) ) ;
2019-07-19 17:10:29 +02:00
2019-08-06 17:06:05 +02:00
final String s5 = " İstanbul Ticarət Universiteti " ;
System . out . println ( " s5 = " + s5 ) ;
2019-10-08 14:53:52 +02:00
System . out . println ( cf . apply ( conf , Lists . newArrayList ( title ( s5 ) ) ) ) ;
2019-08-06 17:06:05 +02:00
2019-07-08 09:44:02 +02:00
}
2018-10-02 10:37:54 +02:00
}