2019-11-07 12:47:12 +01:00
|
|
|
package eu.dnetlib.pace;
|
|
|
|
|
|
|
|
import eu.dnetlib.Deduper;
|
2020-09-29 12:01:25 +02:00
|
|
|
import eu.dnetlib.jobs.SparkCreateDedupEntity;
|
|
|
|
import eu.dnetlib.jobs.SparkCreateMergeRels;
|
|
|
|
import eu.dnetlib.jobs.SparkCreateSimRels;
|
2019-11-07 12:47:12 +01:00
|
|
|
import eu.dnetlib.pace.config.DedupConfig;
|
|
|
|
import eu.dnetlib.pace.utils.Utility;
|
2020-09-29 12:01:25 +02:00
|
|
|
import eu.dnetlib.support.ArgumentApplicationParser;
|
2020-12-04 15:41:31 +01:00
|
|
|
import org.apache.commons.io.FileUtils;
|
2019-11-07 12:47:12 +01:00
|
|
|
import org.apache.spark.api.java.JavaSparkContext;
|
|
|
|
import org.apache.spark.sql.SparkSession;
|
2020-12-04 15:41:31 +01:00
|
|
|
import org.junit.jupiter.api.*;
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
import java.io.File;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.net.URISyntaxException;
|
|
|
|
import java.nio.file.Paths;
|
|
|
|
|
|
|
|
@Disabled
|
2019-11-20 10:45:00 +01:00
|
|
|
public class DedupLocalTest extends DedupTestUtils {
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
static SparkSession spark;
|
|
|
|
static DedupConfig config;
|
|
|
|
static JavaSparkContext context;
|
|
|
|
|
|
|
|
|
|
|
|
final String entitiesPath = Paths
|
|
|
|
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/orgs_dump").toURI())
|
|
|
|
.toFile()
|
|
|
|
.getAbsolutePath();
|
|
|
|
final static String workingPath = "/tmp/working_dir";
|
|
|
|
final static String numPartitions = "20";
|
|
|
|
|
|
|
|
final static String dedupConfPath = "/eu/dnetlib/pace/config/orgs.tree.conf.json";
|
|
|
|
final static String simRelsPath = workingPath + "/organization_simrel";
|
|
|
|
final static String mergeRelsPath = workingPath + "/organization_mergerel";
|
|
|
|
final static String dedupEntityPath = workingPath + "/organization_dedupentity";
|
2020-06-11 10:46:46 +02:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
public DedupLocalTest() throws URISyntaxException {
|
|
|
|
}
|
|
|
|
|
|
|
|
public static void cleanup() throws IOException {
|
|
|
|
//remove directories to clean workspace
|
|
|
|
FileUtils.deleteDirectory(new File(simRelsPath));
|
|
|
|
FileUtils.deleteDirectory(new File(mergeRelsPath));
|
|
|
|
FileUtils.deleteDirectory(new File(dedupEntityPath));
|
|
|
|
}
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
@BeforeAll
|
|
|
|
public static void setup() throws IOException {
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
cleanup();
|
|
|
|
|
|
|
|
config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/config/orgs.tree.conf.json", DedupLocalTest.class));
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
spark = SparkSession
|
2019-11-07 12:47:12 +01:00
|
|
|
.builder()
|
|
|
|
.appName("Deduplication")
|
|
|
|
.master("local[*]")
|
|
|
|
.getOrCreate();
|
2020-06-11 10:46:46 +02:00
|
|
|
context = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
2019-11-07 12:47:12 +01:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2020-09-29 12:01:25 +02:00
|
|
|
@Test
|
|
|
|
public void createSimRelTest() throws Exception {
|
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createSimRels_parameters.json", SparkCreateSimRels.class));
|
2020-09-29 12:01:25 +02:00
|
|
|
|
|
|
|
parser.parseArgument(
|
|
|
|
new String[] {
|
|
|
|
"-e", entitiesPath,
|
|
|
|
"-w", workingPath,
|
|
|
|
"-np", numPartitions,
|
|
|
|
"-dc", dedupConfPath
|
|
|
|
});
|
|
|
|
|
|
|
|
new SparkCreateSimRels(
|
|
|
|
parser,
|
|
|
|
spark
|
|
|
|
).run();
|
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void createMergeRelTest() throws Exception {
|
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createMergeRels_parameters.json", SparkCreateMergeRels.class));
|
2020-09-29 12:01:25 +02:00
|
|
|
|
|
|
|
parser.parseArgument(
|
|
|
|
new String[] {
|
|
|
|
"-e", entitiesPath,
|
|
|
|
"-w", workingPath,
|
|
|
|
"-np", numPartitions,
|
|
|
|
"-dc", dedupConfPath
|
|
|
|
});
|
|
|
|
|
|
|
|
new SparkCreateMergeRels(
|
|
|
|
parser,
|
|
|
|
spark
|
|
|
|
).run();
|
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void createDedupEntityTest() throws Exception {
|
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class));
|
2020-09-29 12:01:25 +02:00
|
|
|
|
|
|
|
parser.parseArgument(
|
|
|
|
new String[] {
|
|
|
|
"-e", entitiesPath,
|
|
|
|
"-w", workingPath,
|
|
|
|
"-np", numPartitions,
|
|
|
|
"-dc", dedupConfPath
|
|
|
|
});
|
|
|
|
|
|
|
|
new SparkCreateDedupEntity(
|
|
|
|
parser,
|
|
|
|
spark
|
|
|
|
).run();
|
|
|
|
}
|
|
|
|
|
2019-11-07 12:47:12 +01:00
|
|
|
@Test
|
2020-12-04 15:41:31 +01:00
|
|
|
public void deduplicationTest() throws IOException {
|
2020-06-11 10:46:46 +02:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
long before_simrels = System.currentTimeMillis();
|
2020-06-11 10:46:46 +02:00
|
|
|
Deduper.createSimRels(
|
|
|
|
config,
|
|
|
|
spark,
|
|
|
|
entitiesPath,
|
2020-12-04 15:41:31 +01:00
|
|
|
simRelsPath
|
2020-06-11 10:46:46 +02:00
|
|
|
);
|
2020-12-04 15:41:31 +01:00
|
|
|
long simrels_time = System.currentTimeMillis() - before_simrels;
|
|
|
|
|
|
|
|
long simrels_number = spark.read().load(simRelsPath).count();
|
2020-06-11 10:46:46 +02:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
long before_mergerels = System.currentTimeMillis();
|
2020-06-11 10:46:46 +02:00
|
|
|
Deduper.createMergeRels(
|
|
|
|
config,
|
|
|
|
entitiesPath,
|
2020-12-04 15:41:31 +01:00
|
|
|
mergeRelsPath,
|
|
|
|
simRelsPath,
|
2020-06-11 10:46:46 +02:00
|
|
|
spark
|
|
|
|
);
|
2020-12-04 15:41:31 +01:00
|
|
|
long mergerels_time = System.currentTimeMillis() - before_mergerels;
|
|
|
|
|
|
|
|
long mergerels_number = spark.read().load(mergeRelsPath).count();
|
2020-06-11 10:46:46 +02:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
long before_dedupentity = System.currentTimeMillis();
|
2020-06-11 10:46:46 +02:00
|
|
|
Deduper.createDedupEntity(
|
|
|
|
config,
|
2020-12-04 15:41:31 +01:00
|
|
|
mergeRelsPath,
|
2020-06-11 10:46:46 +02:00
|
|
|
entitiesPath,
|
|
|
|
spark,
|
2020-12-04 15:41:31 +01:00
|
|
|
dedupEntityPath
|
2020-06-11 10:46:46 +02:00
|
|
|
);
|
2020-12-04 15:41:31 +01:00
|
|
|
long dedupentity_time = System.currentTimeMillis() - before_dedupentity;
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
long dedupentity_number = context.textFile(dedupEntityPath).count();
|
|
|
|
|
|
|
|
System.out.println("Number of simrels : " + simrels_number);
|
|
|
|
System.out.println("Number of mergerels : " + mergerels_number);
|
|
|
|
System.out.println("Number of dedupentities : " + dedupentity_number);
|
|
|
|
System.out.println("Total time for simrels creation : " + simrels_time);
|
|
|
|
System.out.println("Total time for mergerels creation : " + mergerels_time);
|
|
|
|
System.out.println("Total time for dedupentity creation : " + dedupentity_time);
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
cleanup();
|
|
|
|
}
|
2019-11-07 12:47:12 +01:00
|
|
|
}
|