package eu.dnetlib.pace; import eu.dnetlib.Deduper; import eu.dnetlib.jobs.SparkCreateDedupEntity; import eu.dnetlib.jobs.SparkCreateMergeRels; import eu.dnetlib.jobs.SparkCreateSimRels; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.tree.support.TreeProcessor; import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.utils.Utility; import eu.dnetlib.support.ArgumentApplicationParser; import eu.dnetlib.support.Block; import eu.dnetlib.support.Relation; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.ForeachFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.*; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.junit.jupiter.MockitoExtension; import scala.Tuple2; import java.awt.*; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.net.URISyntaxException; import java.nio.file.Paths; import java.util.*; import java.util.List; import java.util.stream.Collectors; @ExtendWith(MockitoExtension.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class) @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class DedupLocalTest extends DedupTestUtils { static SparkSession spark; static DedupConfig config; static JavaSparkContext context; final String entitiesPath = Paths .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()) .toFile() .getAbsolutePath(); final static String workingPath = "/tmp/working_dir"; final 
static String numPartitions = "20";

    /** Absolute path of the dedup configuration resource used by the ordered tests. */
    final String dedupConfPath = Paths
            .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pubs.fdup.exp.json").toURI())
            .toFile()
            .getAbsolutePath();

    // Output locations of the three ordered steps, all located under workingPath.
    final static String simRelsPath = workingPath + "/simrels";
    final static String mergeRelsPath = workingPath + "/mergerels";
    final static String dedupEntityPath = workingPath + "/dedupentities";

    /**
     * Explicit constructor: the instance field initializers call {@code toURI()},
     * so the checked {@link URISyntaxException} has to be declared here.
     */
    public DedupLocalTest() throws URISyntaxException {
    }

    /**
     * Deletes the simrels, mergerels and dedupentities directories of the working path.
     *
     * @throws IOException if one of the directories cannot be deleted
     */
    public static void cleanup() throws IOException {
        //remove directories to clean workspace
        FileUtils.deleteDirectory(new File(simRelsPath));
        FileUtils.deleteDirectory(new File(mergeRelsPath));
        FileUtils.deleteDirectory(new File(dedupEntityPath));
    }

    /**
     * One-time initialisation: cleans the workspace, loads the dedup configuration from
     * {@code dedupConfPath} and creates the local ({@code local[*]}) Spark session together
     * with its {@link JavaSparkContext}.
     *
     * <p>This method is not static; the class is annotated with
     * {@code @TestInstance(PER_CLASS)}, which allows a non-static {@code @BeforeAll}.
     *
     * @throws IOException if cleaning the workspace or reading the configuration fails
     */
    @BeforeAll
    public void setup() throws IOException {

        cleanup();

        config = DedupConfig.load(readFileFromHDFS(dedupConfPath));

        spark = SparkSession
                .builder()
                .appName("Deduplication")
                .master("local[*]")
                .getOrCreate();
        context = JavaSparkContext.fromSparkContext(spark.sparkContext());
    }

    /**
     * Removes the output directories once all tests have run.
     *
     * @throws IOException if one of the directories cannot be deleted
     */
    @AfterAll
    public static void finalCleanUp() throws IOException {
        cleanup();
    }

    /**
     * Reads a text file through the Hadoop {@link FileSystem} obtained from a default
     * {@link Configuration} and returns its content as a single string.
     *
     * <p>The lines are concatenated without any separator, i.e. line terminators are dropped.
     * The bytes are decoded as UTF-8 explicitly, so the result does not depend on the
     * platform default charset.
     *
     * @param filePath path of the file to read
     * @return the content of the file with its lines joined together
     * @throws IOException if the file system cannot be obtained or the file cannot be opened
     */
    protected static String readFileFromHDFS(String filePath) throws IOException {

        final FileSystem fs = FileSystem.get(new Configuration());

        // try-with-resources closes the reader (and the wrapped stream) on every exit path
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(new Path(filePath)), java.nio.charset.StandardCharsets.UTF_8))) {
            return br.lines().collect(Collectors.joining());
        }
    }

    /**
     * Step 1: runs {@link SparkCreateSimRels} on the publications dump and prints the number
     * of records loaded back from {@code simRelsPath}.
     *
     * <p>NOTE(review): the count is only printed, nothing is asserted on it.
     */
    @Test
    @Order(1)
    public void createSimRelTest() throws Exception {

        ArgumentApplicationParser parser = new ArgumentApplicationParser(
                Utility.readResource("/eu/dnetlib/pace/parameters/createSimRels_parameters.json", SparkCreateSimRels.class));

        parser.parseArgument(
                new String[] {
                        "-e", entitiesPath,
                        "-w", workingPath,
                        "-np", numPartitions,
                        "-dc", dedupConfPath,
                        "-ut", "true"
                });

        new SparkCreateSimRels(
                parser,
                spark
        ).run();

        long simrels_number = spark.read().load(simRelsPath).count();

        System.out.println("simrels_number = " + simrels_number);
    }

    /** Step 2: runs {@link SparkCreateMergeRels} with the same paths and configuration. */
    @Test
    @Order(2)
    public void createMergeRelTest() throws
Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createMergeRels_parameters.json", SparkCreateMergeRels.class)); parser.parseArgument( new String[] { "-e", entitiesPath, "-w", workingPath, "-np", numPartitions, "-dc", dedupConfPath }); new SparkCreateMergeRels( parser, spark ).run(); } @Test @Order(3) public void createDedupEntityTest() throws Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)); parser.parseArgument( new String[] { "-e", entitiesPath, "-w", workingPath, "-np", numPartitions, "-dc", dedupConfPath }); new SparkCreateDedupEntity( parser, spark ).run(); } @Test //full deduplication workflow test @Disabled public void deduplicationTest() throws Exception { //custom parameters for this test DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS( Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI()).toFile().getAbsolutePath() )); String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/software.dump.2000.json").toURI()).toFile().getAbsolutePath(); String simRelsPath = workingPath + "/simrels"; String mergeRelsPath = workingPath + "/mergerels"; String outputPath = workingPath + "/dedup"; long before_simrels = System.currentTimeMillis(); Deduper.createSimRels( dedupConfig, spark, inputPath, simRelsPath, true, false ); long simrels_time = System.currentTimeMillis() - before_simrels; long simrels_number = spark.read().load(simRelsPath).count(); long before_mergerels = System.currentTimeMillis(); Deduper.createMergeRels( dedupConfig, inputPath, mergeRelsPath, simRelsPath, spark ); long mergerels_time = System.currentTimeMillis() - before_mergerels; long mergerels_number = spark.read().load(mergeRelsPath).count(); long before_dedupentity = System.currentTimeMillis(); 
Deduper.createDedupEntity( dedupConfig, mergeRelsPath, inputPath, spark, outputPath ); long dedupentity_time = System.currentTimeMillis() - before_dedupentity; long dedupentity_number = context.textFile(outputPath).count(); System.out.println("Number of simrels : " + simrels_number); System.out.println("Number of mergerels : " + mergerels_number); System.out.println("Number of dedupentities : " + dedupentity_number); System.out.println("Total time for simrels creation : " + simrels_time); System.out.println("Total time for mergerels creation : " + mergerels_time); System.out.println("Total time for dedupentity creation : " + dedupentity_time); // FileUtils.deleteDirectory(new File(workingPath)); } @Test //test the match between two JSON @Disabled public void matchTest() throws Exception { String json1 = "{\"author\":[{\"affiliation\":[],\"fullname\":\"Hanayik, Taylor\",\"name\":\"Taylor\",\"pid\":[],\"rank\":1,\"surname\":\"Hanayik\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Drake, Christopher\",\"name\":\"Christopher\",\"pid\":[],\"rank\":2,\"surname\":\"Drake\"},{\"affiliation\":[],\"fullname\":\"Rorden, Chris\",\"name\":\"Chris\",\"pid\":[],\"rank\":3,\"surname\":\"Rorden\"},{\"affiliation\":[],\"fullname\":\"Hardcastle, Nell\",\"name\":\"Nell\",\"pid\":[],\"rank\":4,\"surname\":\"Hardcastle\"},{\"affiliation\":[],\"fullname\":\"Androulakis, Anthony\",\"name\":\"Anthony\",\"pid\":[],\"rank\":5,\"surname\":\"Androulakis\"}],\"bestaccessright\":{\"classid\":\"OPEN\",\"classname\":\"Open 
Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":[{\"key\":\"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254\",\"value\":\"Datacite\"}],\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"dateofacceptance\":{\"value\":\"2022-03-02\"},\"dateofcollection\":\"2022-03-02T11:25:20+0000\",\"dateoftransformation\":\"2022-03-02T11:25:20+0000\",\"description\":[{\"value\":\"a WebGL2 based NIFTI volume viewer\"}],\"id\":\"50|doi_________::b596bfb411bbc62b902aedb11d0088d8\",\"instance\":[{\"accessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":{\"key\":\"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254\",\"value\":\"Datacite\"},\"dateofacceptance\":{\"value\":\"2022-03-02\"},\"hostedby\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"instancetype\":{\"classid\":\"0029\",\"classname\":\"Software\",\"schemeid\":\"dnet:publication_resource\",\"schemename\":\"dnet:publication_resource\"},\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object 
Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786269\"}],\"refereed\":{\"classid\":\"0000\",\"classname\":\"Unknown\",\"schemeid\":\"dnet:review_levels\",\"schemename\":\"dnet:review_levels\"},\"url\":[\"https://dx.doi.org/10.5281/zenodo.5786269\"]}],\"language\":{\"classid\":\"und\",\"classname\":\"Undetermined\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"originalId\":[\"10.5281/zenodo.5786269\"],\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786269\"}],\"publisher\":{\"value\":\"Zenodo\"},\"relevantdate\":[{\"qualifier\":{\"classid\":\"issued\",\"classname\":\"issued\",\"schemeid\":\"dnet:dataCite_date\",\"schemename\":\"dnet:dataCite_date\"},\"value\":\"2022-03-02\"}],\"resourcetype\":{\"classid\":\"UNKNOWN\",\"classname\":\"Unknown\",\"schemeid\":\"dnet:dataCite_resource\",\"schemename\":\"dnet:dataCite_resource\"},\"resulttype\":{\"classid\":\"software\",\"classname\":\"software\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"subject\":[],\"title\":[{\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"value\":\"niivue/niivue: 0.21.1\"}]}"; String json2 = 
"{\"author\":[{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Chris Rorden\",\"name\":\"\",\"pid\":[],\"rank\":1,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of Oxford\"}],\"fullname\":\"Taylor Hanayik\",\"name\":\"\",\"pid\":[],\"rank\":2,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Christopher Drake\",\"name\":\"\",\"pid\":[],\"rank\":3,\"surname\":\"\"},{\"affiliation\":[],\"fullname\":\"Nell Hardcastle\",\"name\":\"\",\"pid\":[],\"rank\":4,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Roger Newman-Norlund\",\"name\":\"\",\"pid\":[],\"rank\":5,\"surname\":\"\"}],\"bestaccessright\":{\"classid\":\"OPEN\",\"classname\":\"Open 
Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":[{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"}],\"context\":[],\"contributor\":[],\"country\":[],\"coverage\":[],\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"dateofacceptance\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"2021-12-16\"},\"dateofcollection\":\"2022-11-11T00:19:05+0000\",\"dateoftransformation\":\"2022-11-11T07:32:42.689Z\",\"description\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"
a WebGL2 based NIFTI volume viewer
\"}],\"documentationUrl\":[],\"eoscifguidelines\":[],\"externalReference\":[],\"extraInfo\":[],\"format\":[],\"fulltext\":[],\"id\":\"50|doi_________::59a7f397515febc2c7017fd2f866a777\",\"instance\":[{\"accessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"alternateIdentifier\":[],\"collectedfrom\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"dateofacceptance\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"2021-12-16\"},\"hostedby\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"instancetype\":{\"classid\":\"0029\",\"classname\":\"Software\",\"schemeid\":\"dnet:publication_resource\",\"schemename\":\"dnet:publication_resource\"},\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object 
Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786270\"}],\"refereed\":{\"classid\":\"0000\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:review_levels\",\"schemename\":\"dnet:review_levels\"},\"url\":[\"http://dx.doi.org/10.5281/zenodo.5786270\",\"https://doi.org/10.5281/zenodo.5786270\"]}],\"language\":{\"classid\":\"UNKNOWN\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"lastupdatetimestamp\":1668556279928,\"license\":[],\"originalId\":[\"oai:zenodo.org:5786270\",\"50|od______2659::c8a3ff31e2ff537ebac22cdc4d7c64b0\"],\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786270\"}],\"programmingLanguage\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"},\"publisher\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"Zenodo\"},\"relevantdate\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"issued\",\"classname\":\"issued\",\"schemeid\":\"dnet:dataCite_date\",\"schemename\":\"dnet:dataCite_date\"},\"value\":\"2021-12-16\"}],\"re
sourcetype\":{\"classid\":\"UNKNOWN\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:dataCite_resource\",\"schemename\":\"dnet:dataCite_resource\"},\"resulttype\":{\"classid\":\"software\",\"classname\":\"software\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"source\":[],\"subject\":[],\"title\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"value\":\"niivue/niivue: 0.13.0\"}]}"; DedupConfig config = DedupConfig.load(readFileFromHDFS(Paths .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI()) .toFile() .getAbsolutePath())); MapDocument a = MapDocumentUtil.asMapDocumentWithJPath(config, json1); MapDocument b = MapDocumentUtil.asMapDocumentWithJPath(config, json2); boolean result = new TreeProcessor(config).compare(a,b); System.out.println("Tree Processor Result = " + result); } @Test //test the dedup of a group of JSON @Disabled public void dedupTest() throws Exception { final String entitiesPath = Paths .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/software.to.fix.json").toURI()) .toFile() .getAbsolutePath(); DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(Paths .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI()) .toFile() .getAbsolutePath())); JavaPairRDD