Implemented a new clustering function (WordsSuffixPrefix)
This commit is contained in:
parent
b3ec4194da
commit
33eadb7c9c
|
@ -28,12 +28,12 @@ public class DedupLocalTest extends DedupTestUtils {
|
||||||
DedupConfig config;
|
DedupConfig config;
|
||||||
JavaSparkContext context;
|
JavaSparkContext context;
|
||||||
|
|
||||||
final String entitiesPath = "/Users/miconis/IdeaProjects/DnetDedup/dnet-dedup/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/examples/organization";
|
final String entitiesPath = "/Users/miconis/Desktop/publications_to_fix.json";
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setup() {
|
public void setup() {
|
||||||
|
|
||||||
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", DedupLocalTest.class));
|
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/publication.current.conf.json", DedupLocalTest.class));
|
||||||
|
|
||||||
spark = SparkSession
|
spark = SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
|
@ -51,20 +51,20 @@ public class DedupLocalTest extends DedupTestUtils {
|
||||||
config,
|
config,
|
||||||
spark,
|
spark,
|
||||||
entitiesPath,
|
entitiesPath,
|
||||||
"/tmp/deduptest/organization_simrel"
|
"/tmp/deduptest/publication_simrel"
|
||||||
);
|
);
|
||||||
|
|
||||||
Deduper.createMergeRels(
|
Deduper.createMergeRels(
|
||||||
config,
|
config,
|
||||||
entitiesPath,
|
entitiesPath,
|
||||||
"/tmp/deduptest/organization_mergerel",
|
"/tmp/deduptest/publication_mergerel",
|
||||||
"/tmp/deduptest/organization_simrel",
|
"/tmp/deduptest/publication_simrel",
|
||||||
spark
|
spark
|
||||||
);
|
);
|
||||||
|
|
||||||
Deduper.createDedupEntity(
|
Deduper.createDedupEntity(
|
||||||
config,
|
config,
|
||||||
"/tmp/deduptest/organization_mergerel",
|
"/tmp/deduptest/publication_mergerel",
|
||||||
entitiesPath,
|
entitiesPath,
|
||||||
spark,
|
spark,
|
||||||
"/tmp/deduptest/dedupentity"
|
"/tmp/deduptest/dedupentity"
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
|
@ClusteringClass("wordssuffixprefix")
|
||||||
|
public class WordsSuffixPrefix extends AbstractClusteringFunction {
|
||||||
|
|
||||||
|
public WordsSuffixPrefix(Map<String, Integer> params) {
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Collection<String> doApply(Config conf, String s) {
|
||||||
|
return suffixPrefix(s, param("len"), param("max"));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Collection<String> suffixPrefix(String s, int len, int max) {
|
||||||
|
|
||||||
|
final int words = s.split(" ").length;
|
||||||
|
final Set<String> bigrams = Sets.newLinkedHashSet();
|
||||||
|
int i = 0;
|
||||||
|
while (++i < s.length() && bigrams.size() < max) {
|
||||||
|
int j = s.indexOf(" ", i);
|
||||||
|
|
||||||
|
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
|
||||||
|
|
||||||
|
if (j - len > 0) {
|
||||||
|
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
|
||||||
|
if (bigram.length() >= 4) {
|
||||||
|
bigrams.add(words+bigram);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bigrams;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -98,6 +98,19 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWordsSuffixPrefix() {
|
||||||
|
|
||||||
|
params.put("len", 3);
|
||||||
|
params.put("max", 4);
|
||||||
|
|
||||||
|
final ClusteringFunction sp = new WordsSuffixPrefix(params);
|
||||||
|
|
||||||
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
|
System.out.println(s);
|
||||||
|
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFieldValue() {
|
public void testFieldValue() {
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue