2018-10-02 10:37:54 +02:00
|
|
|
package eu.dnetlib.pace.clustering;
|
|
|
|
|
|
|
|
import com.google.common.collect.Lists;
|
|
|
|
import com.google.common.collect.Maps;
|
|
|
|
import eu.dnetlib.pace.AbstractPaceTest;
|
2018-11-05 17:22:59 +01:00
|
|
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
2019-10-08 14:53:52 +02:00
|
|
|
import eu.dnetlib.pace.config.DedupConfig;
|
2018-10-02 10:37:54 +02:00
|
|
|
import org.junit.Before;
|
|
|
|
import org.junit.Test;
|
|
|
|
|
2019-10-08 14:53:52 +02:00
|
|
|
import java.util.Map;
|
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
public class ClusteringFunctionTest extends AbstractPaceTest {
|
|
|
|
|
|
|
|
private Map<String, Integer> params;
|
2019-10-08 14:53:52 +02:00
|
|
|
DedupConfig conf;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
@Before
|
|
|
|
public void setUp() throws Exception {
|
|
|
|
params = Maps.newHashMap();
|
2019-11-07 12:47:12 +01:00
|
|
|
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ClusteringFunctionTest.class));
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
2018-11-05 17:22:59 +01:00
|
|
|
@Test
|
|
|
|
public void testUrlClustering() {
|
|
|
|
|
|
|
|
final ClusteringFunction urlClustering = new UrlClustering(params);
|
|
|
|
|
|
|
|
final String s = "http://www.test.it/path/to/resource";
|
|
|
|
System.out.println(s);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s))));
|
2018-11-05 17:22:59 +01:00
|
|
|
}
|
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
@Test
|
|
|
|
public void testNgram() {
|
|
|
|
params.put("ngramLen", 3);
|
|
|
|
params.put("max", 8);
|
|
|
|
params.put("maxPerToken", 2);
|
|
|
|
params.put("minNgramLen", 1);
|
|
|
|
|
|
|
|
final ClusteringFunction ngram = new Ngrams(params);
|
|
|
|
|
|
|
|
final String s = "Search for the Standard Model Higgs Boson";
|
|
|
|
System.out.println(s);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(ngram.apply(conf, Lists.newArrayList(title(s))));
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testNgramPairs() {
|
|
|
|
params.put("ngramLen", 3);
|
2019-08-06 12:09:34 +02:00
|
|
|
params.put("max", 1);
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
final ClusteringFunction np = new NgramPairs(params);
|
|
|
|
|
|
|
|
final String s = "Search for the Standard Model Higgs Boson";
|
|
|
|
System.out.println(s);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(np.apply(conf, Lists.newArrayList(title(s))));
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testSortedNgramPairs() {
|
|
|
|
params.put("ngramLen", 3);
|
|
|
|
params.put("max", 1);
|
|
|
|
|
|
|
|
final ClusteringFunction np = new SortedNgramPairs(params);
|
|
|
|
|
|
|
|
final String s1 = "University of Pisa";
|
|
|
|
System.out.println(s1);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(np.apply(conf, Lists.newArrayList(title(s1))));
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
final String s2 = "Pisa University";
|
|
|
|
System.out.println(s2);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(np.apply(conf, Lists.newArrayList(title(s2))));
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testAcronym() {
|
|
|
|
params.put("max", 4);
|
|
|
|
params.put("minLen", 1);
|
|
|
|
params.put("maxLen", 3);
|
|
|
|
|
|
|
|
final ClusteringFunction acro = new Acronyms(params);
|
|
|
|
|
|
|
|
final String s = "Search for the Standard Model Higgs Boson";
|
|
|
|
System.out.println(s);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(acro.apply(conf, Lists.newArrayList(title(s))));
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testSuffixPrefix() {
|
|
|
|
params.put("len", 3);
|
|
|
|
params.put("max", 4);
|
|
|
|
|
|
|
|
final ClusteringFunction sp = new SuffixPrefix(params);
|
|
|
|
|
|
|
|
final String s = "Search for the Standard Model Higgs Boson";
|
|
|
|
System.out.println(s);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testFieldValue() {
|
|
|
|
|
|
|
|
params.put("randomLength", 5);
|
|
|
|
|
|
|
|
final ClusteringFunction sp = new SpaceTrimmingFieldValue(params);
|
|
|
|
|
|
|
|
final String s = "Search for the Standard Model Higgs Boson";
|
|
|
|
System.out.println(s);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testPersonClustering2() {
|
|
|
|
final ClusteringFunction cf = new PersonClustering(params);
|
|
|
|
|
|
|
|
final String s = readFromClasspath("gt.author.json");
|
|
|
|
System.out.println(s);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(cf.apply(conf, Lists.newArrayList(person(s))));
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|
|
|
|
|
2019-07-08 09:44:02 +02:00
|
|
|
@Test
|
|
|
|
public void testKeywordsClustering() {
|
|
|
|
|
|
|
|
final ClusteringFunction cf = new KeywordsClustering(params);
|
|
|
|
final String s = "Polytechnic University of Turin";
|
|
|
|
System.out.println(s);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
2019-07-08 09:44:02 +02:00
|
|
|
|
|
|
|
final String s1 = "POLITECNICO DI TORINO";
|
|
|
|
System.out.println(s1);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
|
2019-07-08 09:44:02 +02:00
|
|
|
|
2019-07-08 11:01:49 +02:00
|
|
|
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
|
|
|
System.out.println("s2 = " + s2);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s2))));
|
2019-07-08 11:01:49 +02:00
|
|
|
|
|
|
|
final String s3 = "universita universita milano milano";
|
|
|
|
System.out.println("s3 = " + s3);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s3))));
|
2019-07-08 09:44:02 +02:00
|
|
|
|
2019-07-19 17:10:29 +02:00
|
|
|
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
|
|
|
System.out.println("s4 = " + s4);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s4))));
|
2019-07-19 17:10:29 +02:00
|
|
|
|
2019-08-06 17:06:05 +02:00
|
|
|
final String s5 = "İstanbul Ticarət Universiteti";
|
|
|
|
System.out.println("s5 = " + s5);
|
2019-10-08 14:53:52 +02:00
|
|
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
|
2019-08-06 17:06:05 +02:00
|
|
|
|
2019-07-08 09:44:02 +02:00
|
|
|
}
|
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
}
|