2019-11-07 12:47:12 +01:00
|
|
|
package eu.dnetlib.pace;
|
|
|
|
|
|
|
|
import eu.dnetlib.Deduper;
|
2020-09-29 12:01:25 +02:00
|
|
|
import eu.dnetlib.jobs.SparkCreateDedupEntity;
|
|
|
|
import eu.dnetlib.jobs.SparkCreateMergeRels;
|
|
|
|
import eu.dnetlib.jobs.SparkCreateSimRels;
|
2019-11-07 12:47:12 +01:00
|
|
|
import eu.dnetlib.pace.config.DedupConfig;
|
2021-09-13 14:53:19 +02:00
|
|
|
import eu.dnetlib.pace.model.MapDocument;
|
|
|
|
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
|
|
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
2019-11-07 12:47:12 +01:00
|
|
|
import eu.dnetlib.pace.utils.Utility;
|
2020-09-29 12:01:25 +02:00
|
|
|
import eu.dnetlib.support.ArgumentApplicationParser;
|
2021-09-13 14:53:19 +02:00
|
|
|
import eu.dnetlib.support.Block;
|
|
|
|
import eu.dnetlib.support.Relation;
|
|
|
|
import jdk.nashorn.internal.ir.annotations.Ignore;
|
2020-12-04 15:41:31 +01:00
|
|
|
import org.apache.commons.io.FileUtils;
|
2021-09-13 14:53:19 +02:00
|
|
|
import org.apache.hadoop.conf.Configuration;
|
|
|
|
import org.apache.hadoop.fs.FileSystem;
|
|
|
|
import org.apache.hadoop.fs.Path;
|
|
|
|
import org.apache.spark.api.java.JavaPairRDD;
|
|
|
|
import org.apache.spark.api.java.JavaRDD;
|
2019-11-07 12:47:12 +01:00
|
|
|
import org.apache.spark.api.java.JavaSparkContext;
|
2022-01-13 11:58:28 +01:00
|
|
|
import org.apache.spark.api.java.function.ForeachFunction;
|
2021-09-13 14:53:19 +02:00
|
|
|
import org.apache.spark.api.java.function.PairFunction;
|
2022-01-13 11:58:28 +01:00
|
|
|
import org.apache.spark.sql.Dataset;
|
|
|
|
import org.apache.spark.sql.Encoders;
|
|
|
|
import org.apache.spark.sql.Row;
|
2019-11-07 12:47:12 +01:00
|
|
|
import org.apache.spark.sql.SparkSession;
|
2020-12-04 15:41:31 +01:00
|
|
|
import org.junit.jupiter.api.*;
|
2021-09-13 14:53:19 +02:00
|
|
|
import org.junit.jupiter.api.extension.ExtendWith;
|
|
|
|
import org.mockito.junit.jupiter.MockitoExtension;
|
|
|
|
import scala.Tuple2;
|
|
|
|
|
2021-09-17 10:33:29 +02:00
|
|
|
import java.awt.*;
|
2021-09-13 14:53:19 +02:00
|
|
|
import java.awt.event.WindowAdapter;
|
|
|
|
import java.awt.event.WindowEvent;
|
|
|
|
import java.io.BufferedReader;
|
2020-12-04 15:41:31 +01:00
|
|
|
import java.io.File;
|
|
|
|
import java.io.IOException;
|
2021-09-13 14:53:19 +02:00
|
|
|
import java.io.InputStreamReader;
|
2020-12-04 15:41:31 +01:00
|
|
|
import java.net.URISyntaxException;
|
|
|
|
import java.nio.file.Paths;
|
2021-09-13 14:53:19 +02:00
|
|
|
import java.util.*;
|
2021-09-17 10:33:29 +02:00
|
|
|
import java.util.List;
|
2021-09-13 14:53:19 +02:00
|
|
|
import java.util.stream.Collectors;
|
2020-12-04 15:41:31 +01:00
|
|
|
|
2021-09-13 14:53:19 +02:00
|
|
|
@ExtendWith(MockitoExtension.class)
|
|
|
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
|
|
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
2019-11-20 10:45:00 +01:00
|
|
|
public class DedupLocalTest extends DedupTestUtils {
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
static SparkSession spark;
|
|
|
|
static DedupConfig config;
|
|
|
|
static JavaSparkContext context;
|
|
|
|
|
|
|
|
final String entitiesPath = Paths
|
2021-09-13 14:53:19 +02:00
|
|
|
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/organization").toURI())
|
2020-12-04 15:41:31 +01:00
|
|
|
.toFile()
|
|
|
|
.getAbsolutePath();
|
2021-09-13 14:53:19 +02:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
final static String workingPath = "/tmp/working_dir";
|
|
|
|
final static String numPartitions = "20";
|
|
|
|
|
2021-09-13 14:53:19 +02:00
|
|
|
final String dedupConfPath = Paths
|
|
|
|
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/orgs.tree.conf.json").toURI())
|
|
|
|
.toFile()
|
|
|
|
.getAbsolutePath();
|
|
|
|
|
|
|
|
final static String simRelsPath = workingPath + "/simrels";
|
|
|
|
final static String mergeRelsPath = workingPath + "/mergerels";
|
|
|
|
final static String dedupEntityPath = workingPath + "/dedupentities";
|
2020-06-11 10:46:46 +02:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
/**
 * Empty constructor whose only purpose is to declare the {@link URISyntaxException}
 * that the instance field initializers may raise through {@code toURI()}.
 *
 * @throws URISyntaxException if a resource URL cannot be converted to a URI
 */
public DedupLocalTest() throws URISyntaxException {
    super();
}
|
|
|
|
|
|
|
|
public static void cleanup() throws IOException {
|
|
|
|
//remove directories to clean workspace
|
|
|
|
FileUtils.deleteDirectory(new File(simRelsPath));
|
|
|
|
FileUtils.deleteDirectory(new File(mergeRelsPath));
|
|
|
|
FileUtils.deleteDirectory(new File(dedupEntityPath));
|
|
|
|
}
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
@BeforeAll
|
2021-09-13 14:53:19 +02:00
|
|
|
public void setup() throws IOException {
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
cleanup();
|
|
|
|
|
2021-09-13 14:53:19 +02:00
|
|
|
config = DedupConfig.load(readFileFromHDFS(dedupConfPath));
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2020-06-11 10:46:46 +02:00
|
|
|
spark = SparkSession
|
2019-11-07 12:47:12 +01:00
|
|
|
.builder()
|
|
|
|
.appName("Deduplication")
|
|
|
|
.master("local[*]")
|
|
|
|
.getOrCreate();
|
2020-06-11 10:46:46 +02:00
|
|
|
context = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
2019-11-07 12:47:12 +01:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2021-09-13 14:53:19 +02:00
|
|
|
@AfterAll
|
|
|
|
public static void finalCleanUp() throws IOException {
|
|
|
|
cleanup();
|
|
|
|
}
|
|
|
|
|
|
|
|
protected static String readFileFromHDFS(String filePath) throws IOException {
|
|
|
|
|
|
|
|
Path path=new Path(filePath);
|
|
|
|
FileSystem fs = FileSystem.get(new Configuration());
|
|
|
|
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
|
|
|
|
try {
|
|
|
|
return String.join("", br.lines().collect(Collectors.toList()));
|
|
|
|
} finally {
|
|
|
|
br.close();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-09-29 12:01:25 +02:00
|
|
|
@Test
|
2021-09-13 14:53:19 +02:00
|
|
|
@Order(1)
|
2020-09-29 12:01:25 +02:00
|
|
|
public void createSimRelTest() throws Exception {
|
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createSimRels_parameters.json", SparkCreateSimRels.class));
|
2020-09-29 12:01:25 +02:00
|
|
|
|
|
|
|
parser.parseArgument(
|
|
|
|
new String[] {
|
|
|
|
"-e", entitiesPath,
|
|
|
|
"-w", workingPath,
|
|
|
|
"-np", numPartitions,
|
2021-09-13 14:53:19 +02:00
|
|
|
"-dc", dedupConfPath,
|
|
|
|
"-ut", "true"
|
2020-09-29 12:01:25 +02:00
|
|
|
});
|
|
|
|
|
|
|
|
new SparkCreateSimRels(
|
|
|
|
parser,
|
|
|
|
spark
|
|
|
|
).run();
|
2021-09-13 14:53:19 +02:00
|
|
|
|
|
|
|
long simrels_number = spark.read().load(simRelsPath).count();
|
|
|
|
System.out.println("simrels_number = " + simrels_number);
|
2020-09-29 12:01:25 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
2021-09-13 14:53:19 +02:00
|
|
|
@Order(2)
|
2020-09-29 12:01:25 +02:00
|
|
|
public void createMergeRelTest() throws Exception {
|
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createMergeRels_parameters.json", SparkCreateMergeRels.class));
|
2020-09-29 12:01:25 +02:00
|
|
|
|
|
|
|
parser.parseArgument(
|
|
|
|
new String[] {
|
|
|
|
"-e", entitiesPath,
|
|
|
|
"-w", workingPath,
|
|
|
|
"-np", numPartitions,
|
|
|
|
"-dc", dedupConfPath
|
|
|
|
});
|
|
|
|
|
|
|
|
new SparkCreateMergeRels(
|
|
|
|
parser,
|
|
|
|
spark
|
|
|
|
).run();
|
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
2021-09-13 14:53:19 +02:00
|
|
|
@Order(3)
|
2020-09-29 12:01:25 +02:00
|
|
|
public void createDedupEntityTest() throws Exception {
|
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class));
|
2020-09-29 12:01:25 +02:00
|
|
|
|
|
|
|
parser.parseArgument(
|
|
|
|
new String[] {
|
|
|
|
"-e", entitiesPath,
|
|
|
|
"-w", workingPath,
|
|
|
|
"-np", numPartitions,
|
|
|
|
"-dc", dedupConfPath
|
|
|
|
});
|
|
|
|
|
|
|
|
new SparkCreateDedupEntity(
|
|
|
|
parser,
|
|
|
|
spark
|
|
|
|
).run();
|
|
|
|
}
|
|
|
|
|
2021-09-13 14:53:19 +02:00
|
|
|
@Test //full deduplication workflow test
|
|
|
|
@Ignore
|
2022-01-13 11:58:28 +01:00
|
|
|
public void deduplicationTest() throws Exception {
|
2020-06-11 10:46:46 +02:00
|
|
|
|
2021-10-22 11:21:09 +02:00
|
|
|
//custom parameters for this test
|
2022-01-13 11:58:28 +01:00
|
|
|
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
|
|
|
|
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json").toURI()).toFile().getAbsolutePath()
|
|
|
|
));
|
|
|
|
String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()).toFile().getAbsolutePath();
|
2021-10-22 11:21:09 +02:00
|
|
|
String simRelsPath = workingPath + "/simrels";
|
|
|
|
String mergeRelsPath = workingPath + "/mergerels";
|
|
|
|
String outputPath = workingPath + "/dedup";
|
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
long before_simrels = System.currentTimeMillis();
|
2020-06-11 10:46:46 +02:00
|
|
|
Deduper.createSimRels(
|
2021-10-22 11:21:09 +02:00
|
|
|
dedupConfig,
|
2020-06-11 10:46:46 +02:00
|
|
|
spark,
|
2021-10-22 11:21:09 +02:00
|
|
|
inputPath,
|
2021-09-13 14:53:19 +02:00
|
|
|
simRelsPath,
|
2022-01-13 11:58:28 +01:00
|
|
|
true,
|
|
|
|
false
|
2020-06-11 10:46:46 +02:00
|
|
|
);
|
2020-12-04 15:41:31 +01:00
|
|
|
long simrels_time = System.currentTimeMillis() - before_simrels;
|
|
|
|
|
|
|
|
long simrels_number = spark.read().load(simRelsPath).count();
|
2020-06-11 10:46:46 +02:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
long before_mergerels = System.currentTimeMillis();
|
2020-06-11 10:46:46 +02:00
|
|
|
Deduper.createMergeRels(
|
2021-10-22 11:21:09 +02:00
|
|
|
dedupConfig,
|
|
|
|
inputPath,
|
2020-12-04 15:41:31 +01:00
|
|
|
mergeRelsPath,
|
|
|
|
simRelsPath,
|
2020-06-11 10:46:46 +02:00
|
|
|
spark
|
|
|
|
);
|
2020-12-04 15:41:31 +01:00
|
|
|
long mergerels_time = System.currentTimeMillis() - before_mergerels;
|
|
|
|
|
|
|
|
long mergerels_number = spark.read().load(mergeRelsPath).count();
|
2020-06-11 10:46:46 +02:00
|
|
|
|
2020-12-04 15:41:31 +01:00
|
|
|
long before_dedupentity = System.currentTimeMillis();
|
2020-06-11 10:46:46 +02:00
|
|
|
Deduper.createDedupEntity(
|
2021-10-22 11:21:09 +02:00
|
|
|
dedupConfig,
|
2020-12-04 15:41:31 +01:00
|
|
|
mergeRelsPath,
|
2021-10-22 11:21:09 +02:00
|
|
|
inputPath,
|
2020-06-11 10:46:46 +02:00
|
|
|
spark,
|
2021-10-22 11:21:09 +02:00
|
|
|
outputPath
|
2020-06-11 10:46:46 +02:00
|
|
|
);
|
2020-12-04 15:41:31 +01:00
|
|
|
long dedupentity_time = System.currentTimeMillis() - before_dedupentity;
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2021-10-22 11:21:09 +02:00
|
|
|
long dedupentity_number = context.textFile(outputPath).count();
|
2020-12-04 15:41:31 +01:00
|
|
|
|
|
|
|
System.out.println("Number of simrels : " + simrels_number);
|
|
|
|
System.out.println("Number of mergerels : " + mergerels_number);
|
|
|
|
System.out.println("Number of dedupentities : " + dedupentity_number);
|
|
|
|
System.out.println("Total time for simrels creation : " + simrels_time);
|
|
|
|
System.out.println("Total time for mergerels creation : " + mergerels_time);
|
|
|
|
System.out.println("Total time for dedupentity creation : " + dedupentity_time);
|
2019-11-07 12:47:12 +01:00
|
|
|
|
2021-12-27 17:35:02 +01:00
|
|
|
FileUtils.deleteDirectory(new File(workingPath));
|
2020-12-04 15:41:31 +01:00
|
|
|
}
|
2021-09-13 14:53:19 +02:00
|
|
|
|
|
|
|
@Test //test the match between two JSON
|
|
|
|
@Ignore
|
|
|
|
public void matchTest() throws Exception {
|
2022-01-13 17:20:20 +01:00
|
|
|
String json1 = "{\"context\": [], \"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"sysimport:crosswalk:repository\", \"classname\": \"sysimport:crosswalk:repository\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}, \"inferred\": true, \"inferenceprovenance\": \"dedup-similarity-result-levenstein\", \"invisible\": false, \"trust\": \"0.9\"}, \"resourcetype\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"pid\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"qualifier\": {\"classid\": \"pmc\", \"classname\": \"pmc\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"PMC1688333\"}], \"contributor\": [], \"resulttype\": {\"classid\": \"publication\", \"classname\": \"publication\", \"schemename\": \"dnet:result_typologies\", \"schemeid\": \"dnet:result_typologies\"}, \"relevantdate\": [], \"collectedfrom\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"PubMed Central\", \"key\": \"10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357\"}], \"id\": \"50|od_______267::12220bb41c4f62ee8ae2ec051b22444d\", \"subject\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemename\": \"dnet:subject_classification_typologies\", \"schemeid\": 
\"dnet:subject_classification_typologies\"}, \"value\": \"Correspondence\"}], \"embargoenddate\": {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"\"}, \"lastupdatetimestamp\": 0, \"author\": [{\"surname\": \"Smith\", \"name\": \"S. L. H.\", \"pid\": [], \"rank\": 1, \"affiliation\": [], \"fullname\": \"Smith, S L H\"}], \"instance\": [{\"refereed\": {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"\"}, \"hostedby\": {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"PubMed Central\", \"key\": \"10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357\"}, \"processingchargeamount\": {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"\"}, \"license\": {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"\"}, \"url\": [\"https://europepmc.org/articles/PMC1688333/\"], \"distributionlocation\": \"\", \"processingchargecurrency\": {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", 
\"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"},
|
|
|
|
String json2 = "{\"context\": [], \"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"sysimport:actionset\", \"classname\": \"sysimport:actionset\", \"schemename\": \"dnet:provenanceActions\", \"schemeid\": \"dnet:provenanceActions\"}, \"inferred\": true, \"inferenceprovenance\": \"dedup-similarity-result-levenstein\", \"invisible\": false, \"trust\": \"0.9\"}, \"resourcetype\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"pid\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"qualifier\": {\"classid\": \"doi\", \"classname\": \"doi\", \"schemename\": \"dnet:pid_types\", \"schemeid\": \"dnet:pid_types\"}, \"value\": \"10.1136/bmj.2.6039.812-d\"}], \"contributor\": [], \"resulttype\": {\"classid\": \"publication\", \"classname\": \"publication\", \"schemename\": \"dnet:result_typologies\", \"schemeid\": \"dnet:result_typologies\"}, \"relevantdate\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"qualifier\": {\"classid\": \"published-print\", \"classname\": \"published-print\", \"schemename\": \"dnet:dataCite_date\", \"schemeid\": \"dnet:dataCite_date\"}, \"value\": \"1976-10-2\"}], \"collectedfrom\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"Crossref\", \"key\": \"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2\"}, {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": 
{\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"Microsoft Academic Graph\", \"key\": \"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a\"}, {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"UnpayWall\", \"key\": \"10|openaire____::8ac8380272269217cb09a928c8caa993\"}], \"id\": \"50|doiboost____::faf91584ab611ecac7be042c95e82939\", \"subject\": [{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemename\": \"dnet:subject\", \"schemeid\": \"dnet:subject\"}, \"value\": \"General Medicine\"}], \"embargoenddate\": {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"\"}, \"lastupdatetimestamp\": 0, \"author\": [{\"surname\": \"Campbell\", \"name\": \"A G M\", \"pid\": [{\"qualifier\": {\"classid\": \"MAG Identifier\", \"classname\": \"MAG Identifier\"}, \"value\": \"2424823041\"}], \"rank\": 1, \"affiliation\": [], \"fullname\": \"A G M Campbell\"}], \"instance\": [{\"refereed\": {\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\": \"\", \"schemename\": \"\", \"schemeid\": \"\"}, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"\"}, \"value\": \"\"}, \"hostedby\": 
{\"dataInfo\": {\"deletedbyinference\": false, \"provenanceaction\": {\"classid\": \"\", \"classname\"
|
2021-09-17 10:33:29 +02:00
|
|
|
|
|
|
|
DedupConfig config = DedupConfig.load(readFileFromHDFS(Paths
|
2022-01-13 11:58:28 +01:00
|
|
|
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.new.tree.conf.json").toURI())
|
2021-09-17 10:33:29 +02:00
|
|
|
.toFile()
|
|
|
|
.getAbsolutePath()));
|
2021-09-13 14:53:19 +02:00
|
|
|
|
|
|
|
MapDocument a = MapDocumentUtil.asMapDocumentWithJPath(config, json1);
|
|
|
|
MapDocument b = MapDocumentUtil.asMapDocumentWithJPath(config, json2);
|
|
|
|
|
|
|
|
boolean result = new TreeProcessor(config).compare(a,b);
|
|
|
|
|
|
|
|
System.out.println("Tree Processor Result = " + result);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
@Test //test the dedup of a group of JSON
|
|
|
|
@Ignore
|
|
|
|
public void dedupTest() throws Exception {
|
|
|
|
final String entitiesPath = Paths
|
2021-09-17 10:33:29 +02:00
|
|
|
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.to.fix.json").toURI())
|
2021-09-13 14:53:19 +02:00
|
|
|
.toFile()
|
|
|
|
.getAbsolutePath();
|
|
|
|
|
|
|
|
DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(Paths
|
2021-09-17 10:33:29 +02:00
|
|
|
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.prod.tree.conf.json").toURI())
|
2021-09-13 14:53:19 +02:00
|
|
|
.toFile()
|
|
|
|
.getAbsolutePath()));
|
|
|
|
|
|
|
|
JavaPairRDD<String, MapDocument> mapDocuments = context
|
|
|
|
.textFile(entitiesPath)
|
|
|
|
.mapToPair(
|
|
|
|
(PairFunction<String, String, MapDocument>) s -> {
|
|
|
|
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
|
|
|
|
return new Tuple2<>(d.getIdentifier(), d);
|
2021-09-17 10:33:29 +02:00
|
|
|
})
|
|
|
|
.reduceByKey((a,b) -> a);
|
2021-09-13 14:53:19 +02:00
|
|
|
|
|
|
|
// create blocks for deduplication
|
|
|
|
JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
|
|
|
|
for (Tuple2<String, Block> b : blocks.collect()) {
|
|
|
|
System.out.println("*******GROUPS********");
|
|
|
|
System.out.println("key = " + b._1());
|
|
|
|
System.out.println("elements = " + b._2().elements());
|
|
|
|
System.out.println("items = " + b._2().getDocuments().stream().map(d -> d.getIdentifier()).collect(Collectors.joining(",")));
|
|
|
|
System.out.println("*********************");
|
|
|
|
}
|
|
|
|
|
|
|
|
// create relations by comparing only elements in the same group
|
2022-01-13 11:58:28 +01:00
|
|
|
JavaRDD<Relation> relations = Deduper.computeRelations(context, blocks, dedupConf, true, false);
|
2021-09-13 14:53:19 +02:00
|
|
|
for (Relation r: relations.collect()) {
|
|
|
|
System.out.println("*******RELATIONS*******");
|
|
|
|
System.out.println("source = " + r.getSource());
|
|
|
|
System.out.println("target = " + r.getTarget());
|
|
|
|
System.out.println("***********************");
|
|
|
|
}
|
|
|
|
|
|
|
|
//vertexes
|
|
|
|
List<String> vertexes = mapDocuments.map(doc -> doc._1()).collect();
|
|
|
|
|
|
|
|
//edges
|
|
|
|
List<Tuple2<String, String>> edges = new ArrayList<>();
|
|
|
|
relations.collect().stream().forEach(r -> edges.add(new Tuple2(r.getSource(), r.getTarget())));
|
|
|
|
|
2021-09-17 10:33:29 +02:00
|
|
|
showGraph(vertexes, edges, mapDocuments);
|
2021-09-13 14:53:19 +02:00
|
|
|
|
|
|
|
cleanup();
|
|
|
|
}
|
|
|
|
|
2021-09-17 10:33:29 +02:00
|
|
|
public void showGraph(List<String> vertexes, List<Tuple2<String, String>> edges, JavaPairRDD<String, MapDocument> mapDocuments) {
|
2021-09-13 14:53:19 +02:00
|
|
|
|
2021-09-17 10:33:29 +02:00
|
|
|
try {
|
|
|
|
prepareGraphParams(
|
|
|
|
vertexes,
|
|
|
|
edges,
|
|
|
|
"/tmp/graph.html", Paths.get(DedupLocalTest.class.getResource("/graph_visualization_tool/graph_template.html").toURI()).toFile().getAbsolutePath(),
|
|
|
|
mapDocuments.collectAsMap());
|
|
|
|
Desktop.getDesktop().browse(new File("/tmp/graph.html").toURI());
|
|
|
|
} catch (Exception e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
}
|
|
|
|
}
|
2021-09-13 14:53:19 +02:00
|
|
|
|
2021-09-17 10:33:29 +02:00
|
|
|
public int nodeDegree(String id, List<Tuple2<String, String>> edges) {
|
|
|
|
return (int) edges.stream().map(e -> e._1()).filter(s -> s.equalsIgnoreCase(id)).count();
|
|
|
|
}
|
2021-09-13 14:53:19 +02:00
|
|
|
|
2021-09-17 10:33:29 +02:00
|
|
|
public int minDegree(List<String> vertexes, List<Tuple2<String, String>> edges) {
|
2021-09-13 14:53:19 +02:00
|
|
|
|
2021-09-17 10:33:29 +02:00
|
|
|
int minDegree = 100;
|
|
|
|
for (String vertex: vertexes) {
|
|
|
|
int deg = nodeDegree(vertex, edges);
|
|
|
|
if (deg < minDegree)
|
|
|
|
minDegree = deg;
|
2021-09-13 14:53:19 +02:00
|
|
|
}
|
2021-09-17 10:33:29 +02:00
|
|
|
return minDegree;
|
2021-09-13 14:53:19 +02:00
|
|
|
}
|
|
|
|
|
2021-12-27 17:35:02 +01:00
|
|
|
@Test
|
|
|
|
@Ignore
|
2022-01-13 11:58:28 +01:00
|
|
|
public void asMapDocument() throws Exception {
|
2021-12-27 17:35:02 +01:00
|
|
|
|
|
|
|
final String json = "{\"context\": [], \"dataInfo\": {\"invisible\": false, \"trust\": \"0.9\", \"provenanceaction\": {\"classid\": \"sysimport:actionset\", \"classname\": \"sysimport:actionset\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"inferred\": false, \"deletedbyinference\": false}, \"resourcetype\": {\"classid\": \"0013\", \"classname\": \"0013\", \"schemeid\": \"dnet:dataCite_resource\", \"schemename\": \"dnet:dataCite_resource\"}, \"pid\": [{\"qualifier\": {\"classid\": \"doi\", \"classname\": \"doi\", \"schemeid\": \"dnet:pid_types\", \"schemename\": \"dnet:pid_types\"}, \"value\": \"10.1016/b978-0-323-54696-6.00057-4\"}], \"contributor\": [], \"bestaccessright\": {\"classid\": \"RESTRICTED\", \"classname\": \"Restricted\", \"schemeid\": \"dnet:access_modes\", \"schemename\": \"dnet:access_modes\"}, \"relevantdate\": [{\"qualifier\": {\"classid\": \"created\", \"classname\": \"created\", \"schemeid\": \"dnet:dataCite_date\", \"schemename\": \"dnet:dataCite_date\"}, \"value\": \"2018-11-30T10:52:46Z\"}], \"collectedfrom\": [{\"key\": \"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2\", \"value\": \"Crossref\"}], \"id\": \"50|doiboost____::0a5280c186efacdc7c8ce845cec3fceb\", \"subject\": [], \"lastupdatetimestamp\": 1585062372132, \"author\": [{\"fullname\": \"Eric Caumes\", \"surname\": \"Caumes\", \"name\": \"Eric\", \"rank\": 1}], \"instance\": [{\"hostedby\": {\"key\": \"10|openaire____::55045bd2a65019fd8e6741a755395c8c\", \"value\": \"Unknown Repository\"}, \"license\": {\"value\": \"https://www.elsevier.com/tdm/userlicense/1.0/\"}, \"url\": [\"https://api.elsevier.com/content/article/PII:B9780323546966000574?httpAccept=text/xml\", \"https://api.elsevier.com/content/article/PII:B9780323546966000574?httpAccept=text/plain\", \"http://dx.doi.org/10.1016/b978-0-323-54696-6.00057-4\"], \"dateofacceptance\": {\"value\": \"2018-11-30T10:52:46Z\"}, \"collectedfrom\": {\"key\": 
\"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2\", \"value\": \"Crossref\"}, \"accessright\": {\"classid\": \"RESTRICTED\", \"classname\": \"Restricted\", \"schemeid\": \"dnet:access_modes\", \"schemename\": \"dnet:access_modes\"}, \"instancetype\": {\"classid\": \"0013\", \"classname\": \"Part of book or chapter of book\", \"schemeid\": \"dnet:publication_resource\", \"schemename\": \"dnet:publication_resource\"}}], \"dateofcollection\": \"2020-03-24T15:06:12Z\", \"fulltext\": [], \"description\": [], \"format\": [], \"measures\": [], \"coverage\": [], \"externalReference\": [], \"publisher\": {\"value\": \"Elsevier\"}, \"resulttype\": {\"classid\": \"publication\", \"classname\": \"publication\", \"schemeid\": \"dnet:result_typologies\", \"schemename\": \"dnet:result_typologies\"}, \"country\": [], \"extraInfo\": [], \"originalId\": [\"10.1016/b978-0-323-54696-6.00057-4\"], \"source\": [{\"value\": \"Crossref\"}, {\"value\": \"Travel Medicine ISBN: 9780323546966\"}], \"dateofacceptance\": {\"value\": \"2018-11-30T10:52:46Z\"}, \"title\": [{\"qualifier\": {\"classid\": \"main title\", \"classname\": \"main title\", \"schemeid\": \"dnet:dataCite_title\", \"schemename\": \"dnet:dataCite_title\"}, \"value\": \"Skin Diseases\"}]}\n";
|
|
|
|
|
2022-01-13 11:58:28 +01:00
|
|
|
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
|
|
|
|
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json").toURI()).toFile().getAbsolutePath()
|
|
|
|
));
|
2021-12-27 17:35:02 +01:00
|
|
|
|
2022-01-13 11:58:28 +01:00
|
|
|
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, json);
|
2021-12-27 17:35:02 +01:00
|
|
|
|
|
|
|
for(String field: mapDocument.getFieldMap().keySet()) {
|
|
|
|
System.out.println(field + ": " + mapDocument.getFieldMap().get(field).stringValue());
|
|
|
|
}
|
|
|
|
}
|
2022-01-13 11:58:28 +01:00
|
|
|
|
|
|
|
@Test
|
|
|
|
@Ignore
|
|
|
|
public void noMatchTest() throws Exception {
|
|
|
|
|
|
|
|
//custom parameters for this test
|
|
|
|
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
|
|
|
|
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.new.tree.conf.json").toURI()).toFile().getAbsolutePath()
|
|
|
|
));
|
|
|
|
// String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()).toFile().getAbsolutePath();
|
|
|
|
String inputPath = "/Users/miconis/IdeaProjects/DnetHadoop/dnet-hadoop/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/entities/publication/publication.gz";
|
|
|
|
String simRelsPath = workingPath + "/simrels";
|
|
|
|
|
|
|
|
Deduper.createSimRels(
|
|
|
|
dedupConfig,
|
|
|
|
spark,
|
|
|
|
inputPath,
|
|
|
|
simRelsPath,
|
|
|
|
true,
|
|
|
|
true
|
|
|
|
);
|
|
|
|
Dataset<Relation> noMatches = spark.read().load(simRelsPath).as(Encoders.bean(Relation.class));
|
|
|
|
|
|
|
|
System.out.println("noMatches = " + noMatches.count());
|
|
|
|
|
|
|
|
noMatches.foreach((ForeachFunction<Relation>) r -> System.out.println(r.getSource() + " " + r.getTarget()));
|
|
|
|
|
|
|
|
FileUtils.deleteDirectory(new File(workingPath));
|
|
|
|
}
|
|
|
|
}
|