package eu.dnetlib.pace;

import eu.dnetlib.Deduper;
import eu.dnetlib.jobs.SparkCreateDedupEntity;
import eu.dnetlib.jobs.SparkCreateMergeRels;
import eu.dnetlib.jobs.SparkCreateSimRels;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.Block;
import eu.dnetlib.support.Relation;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
import scala.Tuple2;

import java.awt.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.*;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Runs the deduplication steps (similarity relations, merge relations, dedup
 * entities) against JSON resources taken from the test classpath.
 *
 * <p>Test methods carrying {@code @Order} are executed in that order, and the
 * test instance lifecycle is {@code PER_CLASS}.
 */
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
public class DedupLocalTest extends DedupTestUtils {

    // Shared by all tests; these three are assigned in setup().
    static SparkSession spark;
    static DedupConfig config;
    static JavaSparkContext context;

    // Absolute filesystem path of the publications dump resource; used as the "-e" argument.
    final String entitiesPath = Paths
        .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI())
        .toFile()
        .getAbsolutePath();

    // Base directory under which the tests build their output paths.
    final static String workingPath = "/tmp/working_dir";

    // Modifier of numPartitions, whose declaration continues on the following line.
    final
static String numPartitions = "20"; // forwarded to the Spark jobs as the "-np" argument

    // Absolute filesystem path of the dedup configuration resource; used as the "-dc" argument.
    final String dedupConfPath = Paths
        .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pubs.fdup.exp.json").toURI())
        .toFile()
        .getAbsolutePath();

    // Output locations below workingPath; these are the directories removed by cleanup().
    final static String simRelsPath = workingPath + "/simrels";
    final static String mergeRelsPath = workingPath + "/mergerels";
    final static String dedupEntityPath = workingPath + "/dedupentities";

    /**
     * Declared only so that the {@link URISyntaxException} thrown by the
     * {@code toURI()} calls in the field initializers can propagate.
     */
    public DedupLocalTest() throws URISyntaxException {
    }

    /**
     * Deletes the simrels, mergerels and dedupentities directories.
     *
     * @throws IOException if a directory cannot be deleted
     */
    public static void cleanup() throws IOException {
        //remove directories to clean workspace
        FileUtils.deleteDirectory(new File(simRelsPath));
        FileUtils.deleteDirectory(new File(mergeRelsPath));
        FileUtils.deleteDirectory(new File(dedupEntityPath));
    }

    /**
     * Cleans the workspace, loads the dedup configuration and creates the
     * shared Spark session (master {@code local[*]}) and its Java context.
     *
     * @throws IOException if cleanup or reading the configuration fails
     */
    @BeforeAll
    public void setup() throws IOException {

        cleanup();

        config = DedupConfig.load(readFileFromHDFS(dedupConfPath));

        spark = SparkSession
            .builder()
            .appName("Deduplication")
            .master("local[*]")
            .getOrCreate();
        context = JavaSparkContext.fromSparkContext(spark.sparkContext());
    }

    /**
     * Removes the output directories once all tests have run.
     *
     * @throws IOException if a directory cannot be deleted
     */
    @AfterAll
    public static void finalCleanUp() throws IOException {
        cleanup();
    }

    /**
     * Reads a text file through the Hadoop {@link FileSystem} obtained from a
     * default {@link Configuration}.
     *
     * <p>The content is decoded as UTF-8 rather than with the platform default
     * charset, so the result does not depend on the machine running the test.
     * The reader is closed by try-with-resources; the {@code FileSystem}
     * handle itself is left open, as before.
     *
     * @param filePath path of the file to read
     * @return all lines of the file concatenated without any separator
     *         (line terminators are dropped)
     * @throws IOException if the file cannot be opened or read
     */
    protected static String readFileFromHDFS(String filePath) throws IOException {

        Path path = new Path(filePath);
        FileSystem fs = FileSystem.get(new Configuration());

        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(path), java.nio.charset.StandardCharsets.UTF_8))) {
            return br.lines().collect(Collectors.joining());
        }
    }

    /**
     * First ordered step: runs {@link SparkCreateSimRels} on the publications
     * dump and prints how many rows were written to {@code simRelsPath}.
     * Nothing is asserted.
     */
    @Test
    @Order(1)
    public void createSimRelTest() throws Exception {

        ArgumentApplicationParser parser = new ArgumentApplicationParser(
            Utility.readResource("/eu/dnetlib/pace/parameters/createSimRels_parameters.json", SparkCreateSimRels.class));

        parser.parseArgument(
            new String[] {
                "-e", entitiesPath,
                "-w", workingPath,
                "-np", numPartitions,
                "-dc", dedupConfPath,
                "-ut", "true"
            });

        new SparkCreateSimRels(
            parser,
            spark
        ).run();

        long simrels_number = spark.read().load(simRelsPath).count();

        System.out.println("simrels_number = " + simrels_number);
    }

    /**
     * Second ordered step: runs {@link SparkCreateMergeRels} with the same
     * entities, working directory, partition count and configuration.
     */
    @Test
    @Order(2)
    public void createMergeRelTest() throws
Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createMergeRels_parameters.json", SparkCreateMergeRels.class)); parser.parseArgument( new String[] { "-e", entitiesPath, "-w", workingPath, "-np", numPartitions, "-dc", dedupConfPath }); new SparkCreateMergeRels( parser, spark ).run(); } @Test @Order(3) public void createDedupEntityTest() throws Exception { ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)); parser.parseArgument( new String[] { "-e", entitiesPath, "-w", workingPath, "-np", numPartitions, "-dc", dedupConfPath }); new SparkCreateDedupEntity( parser, spark ).run(); } @Test //full deduplication workflow test @Disabled public void deduplicationTest() throws Exception { //custom parameters for this test DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS( Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI()).toFile().getAbsolutePath() )); String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/software.dump.2000.json").toURI()).toFile().getAbsolutePath(); String simRelsPath = workingPath + "/simrels"; String mergeRelsPath = workingPath + "/mergerels"; String outputPath = workingPath + "/dedup"; long before_simrels = System.currentTimeMillis(); Deduper.createSimRels( dedupConfig, spark, inputPath, simRelsPath, true, false ); long simrels_time = System.currentTimeMillis() - before_simrels; long simrels_number = spark.read().load(simRelsPath).count(); long before_mergerels = System.currentTimeMillis(); Deduper.createMergeRels( dedupConfig, inputPath, mergeRelsPath, simRelsPath, spark ); long mergerels_time = System.currentTimeMillis() - before_mergerels; long mergerels_number = spark.read().load(mergeRelsPath).count(); long before_dedupentity = System.currentTimeMillis(); 
Deduper.createDedupEntity( dedupConfig, mergeRelsPath, inputPath, spark, outputPath ); long dedupentity_time = System.currentTimeMillis() - before_dedupentity; long dedupentity_number = context.textFile(outputPath).count(); System.out.println("Number of simrels : " + simrels_number); System.out.println("Number of mergerels : " + mergerels_number); System.out.println("Number of dedupentities : " + dedupentity_number); System.out.println("Total time for simrels creation : " + simrels_time); System.out.println("Total time for mergerels creation : " + mergerels_time); System.out.println("Total time for dedupentity creation : " + dedupentity_time); // FileUtils.deleteDirectory(new File(workingPath)); } @Test //test the match between two JSON @Disabled public void matchTest() throws Exception { String json1 = "{\"author\":[{\"affiliation\":[],\"fullname\":\"Hanayik, Taylor\",\"name\":\"Taylor\",\"pid\":[],\"rank\":1,\"surname\":\"Hanayik\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Drake, Christopher\",\"name\":\"Christopher\",\"pid\":[],\"rank\":2,\"surname\":\"Drake\"},{\"affiliation\":[],\"fullname\":\"Rorden, Chris\",\"name\":\"Chris\",\"pid\":[],\"rank\":3,\"surname\":\"Rorden\"},{\"affiliation\":[],\"fullname\":\"Hardcastle, Nell\",\"name\":\"Nell\",\"pid\":[],\"rank\":4,\"surname\":\"Hardcastle\"},{\"affiliation\":[],\"fullname\":\"Androulakis, Anthony\",\"name\":\"Anthony\",\"pid\":[],\"rank\":5,\"surname\":\"Androulakis\"}],\"bestaccessright\":{\"classid\":\"OPEN\",\"classname\":\"Open 
Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":[{\"key\":\"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254\",\"value\":\"Datacite\"}],\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"dateofacceptance\":{\"value\":\"2022-03-02\"},\"dateofcollection\":\"2022-03-02T11:25:20+0000\",\"dateoftransformation\":\"2022-03-02T11:25:20+0000\",\"description\":[{\"value\":\"a WebGL2 based NIFTI volume viewer\"}],\"id\":\"50|doi_________::b596bfb411bbc62b902aedb11d0088d8\",\"instance\":[{\"accessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":{\"key\":\"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254\",\"value\":\"Datacite\"},\"dateofacceptance\":{\"value\":\"2022-03-02\"},\"hostedby\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"instancetype\":{\"classid\":\"0029\",\"classname\":\"Software\",\"schemeid\":\"dnet:publication_resource\",\"schemename\":\"dnet:publication_resource\"},\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object 
Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786269\"}],\"refereed\":{\"classid\":\"0000\",\"classname\":\"Unknown\",\"schemeid\":\"dnet:review_levels\",\"schemename\":\"dnet:review_levels\"},\"url\":[\"https://dx.doi.org/10.5281/zenodo.5786269\"]}],\"language\":{\"classid\":\"und\",\"classname\":\"Undetermined\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"originalId\":[\"10.5281/zenodo.5786269\"],\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786269\"}],\"publisher\":{\"value\":\"Zenodo\"},\"relevantdate\":[{\"qualifier\":{\"classid\":\"issued\",\"classname\":\"issued\",\"schemeid\":\"dnet:dataCite_date\",\"schemename\":\"dnet:dataCite_date\"},\"value\":\"2022-03-02\"}],\"resourcetype\":{\"classid\":\"UNKNOWN\",\"classname\":\"Unknown\",\"schemeid\":\"dnet:dataCite_resource\",\"schemename\":\"dnet:dataCite_resource\"},\"resulttype\":{\"classid\":\"software\",\"classname\":\"software\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"subject\":[],\"title\":[{\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"value\":\"niivue/niivue: 0.21.1\"}]}"; String json2 = 
"{\"author\":[{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Chris Rorden\",\"name\":\"\",\"pid\":[],\"rank\":1,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of Oxford\"}],\"fullname\":\"Taylor Hanayik\",\"name\":\"\",\"pid\":[],\"rank\":2,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Christopher Drake\",\"name\":\"\",\"pid\":[],\"rank\":3,\"surname\":\"\"},{\"affiliation\":[],\"fullname\":\"Nell Hardcastle\",\"name\":\"\",\"pid\":[],\"rank\":4,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Roger Newman-Norlund\",\"name\":\"\",\"pid\":[],\"rank\":5,\"surname\":\"\"}],\"bestaccessright\":{\"classid\":\"OPEN\",\"classname\":\"Open 
Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":[{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"}],\"context\":[],\"contributor\":[],\"country\":[],\"coverage\":[],\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"dateofacceptance\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"2021-12-16\"},\"dateofcollection\":\"2022-11-11T00:19:05+0000\",\"dateoftransformation\":\"2022-11-11T07:32:42.689Z\",\"description\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"

a WebGL2 based NIFTI volume viewer

\"}],\"documentationUrl\":[],\"eoscifguidelines\":[],\"externalReference\":[],\"extraInfo\":[],\"format\":[],\"fulltext\":[],\"id\":\"50|doi_________::59a7f397515febc2c7017fd2f866a777\",\"instance\":[{\"accessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"alternateIdentifier\":[],\"collectedfrom\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"dateofacceptance\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"2021-12-16\"},\"hostedby\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"instancetype\":{\"classid\":\"0029\",\"classname\":\"Software\",\"schemeid\":\"dnet:publication_resource\",\"schemename\":\"dnet:publication_resource\"},\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object 
Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786270\"}],\"refereed\":{\"classid\":\"0000\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:review_levels\",\"schemename\":\"dnet:review_levels\"},\"url\":[\"http://dx.doi.org/10.5281/zenodo.5786270\",\"https://doi.org/10.5281/zenodo.5786270\"]}],\"language\":{\"classid\":\"UNKNOWN\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"lastupdatetimestamp\":1668556279928,\"license\":[],\"originalId\":[\"oai:zenodo.org:5786270\",\"50|od______2659::c8a3ff31e2ff537ebac22cdc4d7c64b0\"],\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786270\"}],\"programmingLanguage\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"},\"publisher\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"Zenodo\"},\"relevantdate\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"issued\",\"classname\":\"issued\",\"schemeid\":\"dnet:dataCite_date\",\"schemename\":\"dnet:dataCite_date\"},\"value\":\"2021-12-16\"}],\"re
sourcetype\":{\"classid\":\"UNKNOWN\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:dataCite_resource\",\"schemename\":\"dnet:dataCite_resource\"},\"resulttype\":{\"classid\":\"software\",\"classname\":\"software\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"source\":[],\"subject\":[],\"title\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"value\":\"niivue/niivue: 0.13.0\"}]}"; DedupConfig config = DedupConfig.load(readFileFromHDFS(Paths .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI()) .toFile() .getAbsolutePath())); MapDocument a = MapDocumentUtil.asMapDocumentWithJPath(config, json1); MapDocument b = MapDocumentUtil.asMapDocumentWithJPath(config, json2); boolean result = new TreeProcessor(config).compare(a,b); System.out.println("Tree Processor Result = " + result); } @Test //test the dedup of a group of JSON @Disabled public void dedupTest() throws Exception { final String entitiesPath = Paths .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/software.to.fix.json").toURI()) .toFile() .getAbsolutePath(); DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(Paths .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI()) .toFile() .getAbsolutePath())); JavaPairRDD mapDocuments = context .textFile(entitiesPath) .mapToPair( (PairFunction) s -> { MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); return new Tuple2<>(d.getIdentifier(), d); }) .reduceByKey((a,b) -> a); // create blocks for deduplication JavaPairRDD blocks = 
Deduper.createSortedBlocks(mapDocuments, dedupConf); for (Tuple2 b : blocks.collect()) { System.out.println("*******GROUPS********"); System.out.println("key = " + b._1()); System.out.println("elements = " + b._2().elements()); System.out.println("items = " + b._2().getDocuments().stream().map(d -> d.getIdentifier()).collect(Collectors.joining(","))); System.out.println("*********************"); } // create relations by comparing only elements in the same group JavaRDD relations = Deduper.computeRelations(context, blocks, dedupConf, true, false); for (Relation r: relations.collect()) { System.out.println("*******RELATIONS*******"); System.out.println("source = " + r.getSource()); System.out.println("target = " + r.getTarget()); System.out.println("***********************"); } //vertexes List vertexes = mapDocuments.map(doc -> doc._1()).collect(); //edges List> edges = new ArrayList<>(); relations.collect().stream().forEach(r -> edges.add(new Tuple2(r.getSource(), r.getTarget()))); showGraph(vertexes, edges, mapDocuments); cleanup(); } public void showGraph(List vertexes, List> edges, JavaPairRDD mapDocuments) { try { prepareGraphParams( vertexes, edges, "/tmp/graph.html", Paths.get(DedupLocalTest.class.getResource("/graph_visualization_tool/graph_template.html").toURI()).toFile().getAbsolutePath(), mapDocuments.collectAsMap()); Desktop.getDesktop().browse(new File("/tmp/graph.html").toURI()); } catch (Exception e) { e.printStackTrace(); } } public int nodeDegree(String id, List> edges) { return (int) edges.stream().map(e -> e._1()).filter(s -> s.equalsIgnoreCase(id)).count(); } public int minDegree(List vertexes, List> edges) { int minDegree = 100; for (String vertex: vertexes) { int deg = nodeDegree(vertex, edges); if (deg < minDegree) minDegree = deg; } return minDegree; } @Test @Disabled public void asMapDocument() throws Exception { final String json = "{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": 
\"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": true, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"resourcetype\": {\"classid\": \"0001\", \"classname\": \"0001\", \"schemeid\": \"dnet:dataCite_resource\", \"schemename\": \"dnet:dataCite_resource\"}, \"pid\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"handle\", \"classname\": \"Handle\", \"schemeid\": \"dnet:pid_types\", \"schemename\": \"dnet:pid_types\"}, \"value\": \"11370/8fb3e34b-47c4-4e87-a675-c889db060e19\"}], \"contributor\": [], \"bestaccessright\": {\"classid\": \"CLOSED\", \"classname\": \"Closed Access\", \"schemeid\": \"dnet:access_modes\", \"schemename\": \"dnet:access_modes\"}, \"relevantdate\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"issued\", \"classname\": \"issued\", \"schemeid\": \"dnet:dataCite_date\", \"schemename\": \"dnet:dataCite_date\"}, \"value\": \"2018-01-01\"}], \"collectedfrom\": [{\"dataInfo\": null, \"key\": \"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f\", \"value\": \"DANS (Data Archiving and Networked Services)\"}], \"id\": \"50|DansKnawCris::4ad8a5701b7d06b851966e1b323a2a95\", \"subject\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": 
\"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"OF-THE-ART\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"SCHOOL LEADERSHIP\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"DYNAMIC-MODEL\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": 
\"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"IMPLEMENTATION\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"OUTCOMES\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"To be checked by Faculty\"}], \"embargoenddate\": null, \"lastupdatetimestamp\": 1645007805304, \"author\": [{\"surname\": \"Kyriakides\", \"name\": \"Leonidas\", \"pid\": [], \"rank\": 1, \"affiliation\": [], \"fullname\": \"Kyriakides, Leonidas\"}, {\"surname\": \"Georgiou\", \"name\": \"Maria P.\", \"pid\": [], \"rank\": 2, \"affiliation\": [], \"fullname\": \"Georgiou, Maria P.\"}, {\"surname\": \"Creemers\", \"name\": \"Bert P. M.\", \"pid\": [], \"rank\": 3, \"affiliation\": [], \"fullname\": \"Creemers, Bert P. 
M.\"}, {\"surname\": \"Panayiotou\", \"name\": \"Anastasia\", \"pid\": [], \"rank\": 4, \"affiliation\": [], \"fullname\": \"Panayiotou, Anastasia\"}, {\"surname\": \"Reynolds\", \"name\": \"David\", \"pid\": [], \"rank\": 5, \"affiliation\": [], \"fullname\": \"Reynolds, David\"}], \"instance\": [{\"refereed\": {\"classid\": \"0000\", \"classname\": \"UNKNOWN\", \"schemeid\": \"dnet:review_levels\", \"schemename\": \"dnet:review_levels\"}, \"hostedby\": {\"dataInfo\": null, \"key\": \"10|issn___print::2763d4e6e2e870a7ffd4bf8863c6bbff\", \"value\": \"School Effectiveness and School Improvement\"}, \"accessright\": {\"classid\": \"CLOSED\", \"classname\": \"Closed Access\", \"schemeid\": \"dnet:access_modes\", \"schemename\": \"dnet:access_modes\", \"openAccessRoute\": null}, \"license\": null, \"url\": [\"\", \"http://dx.doi.org/10.1080/09243453.2017.1398761\"], \"measures\": null, \"pid\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"handle\", \"classname\": \"Handle\", \"schemeid\": \"dnet:pid_types\", \"schemename\": \"dnet:pid_types\"}, \"value\": \"11370/8fb3e34b-47c4-4e87-a675-c889db060e19\"}], \"distributionlocation\": null, \"processingchargecurrency\": null, \"alternateIdentifier\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"urn\", \"classname\": \"urn\", \"schemeid\": \"dnet:pid_types\", \"schemename\": 
\"dnet:pid_types\"}, \"value\": \"urn:nbn:nl:ui:11-8fb3e34b-47c4-4e87-a675-c889db060e19\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"doi\", \"classname\": \"Digital Object Identifier\", \"schemeid\": \"dnet:pid_types\", \"schemename\": \"dnet:pid_types\"}, \"value\": \"10.1080/09243453.2017.1398761\"}], \"dateofacceptance\": {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"value\": \"2018-01-01\"}, \"collectedfrom\": {\"dataInfo\": null, \"key\": \"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f\", \"value\": \"DANS (Data Archiving and Networked Services)\"}, \"processingchargeamount\": null, \"instancetype\": {\"classid\": \"0001\", \"classname\": \"Article\", \"schemeid\": \"dnet:publication_resource\", \"schemename\": \"dnet:publication_resource\"}}], \"resulttype\": {\"classid\": \"publication\", \"classname\": \"publication\", \"schemeid\": \"dnet:result_typologies\", \"schemename\": \"dnet:result_typologies\"}, \"dateofcollection\": \"2020-11-12T23:04:09.482Z\", \"fulltext\": [], \"dateoftransformation\": \"2020-11-13T01:33:00.881Z\", \"description\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": 
false, \"trust\": \"0.9\"}, \"value\": \"This paper investigates the impact of national policies for improving teaching and the school learning environment (SLE) on student achievement. In each participating country (i.e., Belgium/Flanders, Cyprus, Germany, Greece, Ireland, and Slovenia), a sample of at least 50 schools was drawn and tests in mathematics and science were administered to all Grade 4 students (N = 10,742) at the beginning and end of school year 2010\\u20132011. National policies were measured through (a) content analysis of policy documents, (b) interviews with policymakers, and (c) head-teacher questionnaires. Multilevel analyses revealed that most aspects of national policies for teaching and SLE were associated with student achievement in each subject irrespective of the source of data used to measure them. Implications are, finally, drawn.\"}], \"format\": [], \"processingchargecurrency\": null, \"journal\": {\"issnPrinted\": \"0924-3453\", \"conferencedate\": null, \"vol\": \"29\", \"conferenceplace\": null, \"name\": \"School Effectiveness and School Improvement\", \"iss\": \"2\", \"sp\": \"171\", \"edition\": \"\", \"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"issnOnline\": \"\", \"ep\": \"203\", \"issnLinking\": \"\"}, \"measures\": null, \"dateofacceptance\": {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"value\": \"2018-01-01\"}, \"coverage\": [], 
\"processingchargeamount\": null, \"externalReference\": [], \"publisher\": null, \"language\": {\"classid\": \"eng\", \"classname\": \"English\", \"schemeid\": \"dnet:languages\", \"schemename\": \"dnet:languages\"}, \"oaiprovenance\": {\"originDescription\": {\"metadataNamespace\": \"\", \"harvestDate\": \"2020-11-12T23:04:09.482Z\", \"baseURL\": \"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif\", \"datestamp\": \"2020-11-12T00:51:43Z\", \"altered\": true, \"identifier\": \"oai:services.nod.dans.knaw.nl:Publications/rug:oai:pure.rug.nl:publications/8fb3e34b-47c4-4e87-a675-c889db060e19\"}}, \"country\": [], \"extraInfo\": [], \"originalId\": [\"oai:services.nod.dans.knaw.nl:Publications/rug:oai:pure.rug.nl:publications/8fb3e34b-47c4-4e87-a675-c889db060e19\", \"50|DansKnawCris::4ad8a5701b7d06b851966e1b323a2a95\"], \"source\": [], \"context\": [], \"title\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"main title\", \"classname\": \"main title\", \"schemeid\": \"dnet:dataCite_title\", \"schemename\": \"dnet:dataCite_title\"}, \"value\": \"The impact of national educational policies on student achievement\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"subtitle\", \"classname\": \"subtitle\", \"schemeid\": \"dnet:dataCite_title\", \"schemename\": \"dnet:dataCite_title\"}, \"value\": \"a European study\"}]}"; DedupConfig dedupConfig 
= DedupConfig.load(readFileFromHDFS( Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.new.tree.conf.json").toURI()).toFile().getAbsolutePath() )); final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, json); for(String field: mapDocument.getFieldMap().keySet()) { System.out.println(field + ": " + mapDocument.getFieldMap().get(field).stringValue()); } }

    /**
     * Manual exploration test (disabled by default): runs {@code Deduper.createSimRels} over the
     * bundled {@code publications.dump.1000.json} sample using the {@code pub.new.tree.conf.json}
     * configuration, reads the written relations back and prints their count followed by one
     * "source target" line per relation.
     *
     * <p>The working directory is removed when the test finishes, whether it succeeded or not.
     *
     * @throws Exception if a classpath resource cannot be resolved, the configuration cannot be
     *                   read, or the Spark job / cleanup fails
     */
    @Test
    @Disabled
    public void noMatchTest() throws Exception {

        // custom parameters for this test
        final String confPath = Paths
            .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.new.tree.conf.json").toURI())
            .toFile()
            .getAbsolutePath();
        final DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(confPath));

        final String inputPath = Paths
            .get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI())
            .toFile()
            .getAbsolutePath();

        // Named differently from the class-level simRelsPath constant to avoid shadowing it.
        final String simRelsOutputPath = workingPath + "/simrels";

        try {
            // NOTE(review): the meaning of the two trailing boolean flags is defined by
            // Deduper.createSimRels, which is not visible from this file; both are passed as true.
            Deduper.createSimRels(
                dedupConfig,
                spark,
                inputPath,
                simRelsOutputPath,
                true,
                true
            );

            // Typed dataset: with raw types the lambda parameter below would be an Object and
            // getSource()/getTarget() would not resolve.
            final Dataset<Relation> noMatches = spark
                .read()
                .load(simRelsOutputPath)
                .as(Encoders.bean(Relation.class));

            System.out.println("noMatches = " + noMatches.count());

            // The explicit cast selects the Java-friendly foreach overload of Dataset.
            noMatches.foreach((ForeachFunction<Relation>) r -> System.out.println(r.getSource() + " " + r.getTarget()));
        } finally {
            // Always clean the workspace, even when the job or the read-back fails.
            FileUtils.deleteDirectory(new File(workingPath));
        }
    }
}