dnet-dedup/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupLocalTest.java

384 lines
39 KiB
Java

package eu.dnetlib.pace;
import eu.dnetlib.Deduper;
import eu.dnetlib.jobs.SparkCreateDedupEntity;
import eu.dnetlib.jobs.SparkCreateMergeRels;
import eu.dnetlib.jobs.SparkCreateSimRels;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.Block;
import eu.dnetlib.support.Relation;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
import scala.Tuple2;
import java.awt.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.*;
import java.util.List;
import java.util.stream.Collectors;
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
public class DedupLocalTest extends DedupTestUtils {
static SparkSession spark;
static DedupConfig config;
static JavaSparkContext context;
final String entitiesPath = Paths
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI())
.toFile()
.getAbsolutePath();
final static String workingPath = "/tmp/working_dir";
final static String numPartitions = "20";
final String dedupConfPath = Paths
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pubs.fdup.exp.json").toURI())
.toFile()
.getAbsolutePath();
final static String simRelsPath = workingPath + "/simrels";
final static String mergeRelsPath = workingPath + "/mergerels";
final static String dedupEntityPath = workingPath + "/dedupentities";
public DedupLocalTest() throws URISyntaxException {
}
public static void cleanup() throws IOException {
//remove directories to clean workspace
FileUtils.deleteDirectory(new File(simRelsPath));
FileUtils.deleteDirectory(new File(mergeRelsPath));
FileUtils.deleteDirectory(new File(dedupEntityPath));
}
@BeforeAll
public void setup() throws IOException {
cleanup();
config = DedupConfig.load(readFileFromHDFS(dedupConfPath));
spark = SparkSession
.builder()
.appName("Deduplication")
.master("local[*]")
.getOrCreate();
context = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
@AfterAll
public static void finalCleanUp() throws IOException {
cleanup();
}
protected static String readFileFromHDFS(String filePath) throws IOException {
Path path=new Path(filePath);
FileSystem fs = FileSystem.get(new Configuration());
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
try {
return String.join("", br.lines().collect(Collectors.toList()));
} finally {
br.close();
}
}
@Test
@Order(1)
public void createSimRelTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createSimRels_parameters.json", SparkCreateSimRels.class));
parser.parseArgument(
new String[] {
"-e", entitiesPath,
"-w", workingPath,
"-np", numPartitions,
"-dc", dedupConfPath,
"-ut", "true"
});
new SparkCreateSimRels(
parser,
spark
).run();
long simrels_number = spark.read().load(simRelsPath).count();
System.out.println("simrels_number = " + simrels_number);
}
@Test
@Order(2)
public void createMergeRelTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createMergeRels_parameters.json", SparkCreateMergeRels.class));
parser.parseArgument(
new String[] {
"-e", entitiesPath,
"-w", workingPath,
"-np", numPartitions,
"-dc", dedupConfPath
});
new SparkCreateMergeRels(
parser,
spark
).run();
}
@Test
@Order(3)
public void createDedupEntityTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(Utility.readResource("/eu/dnetlib/pace/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class));
parser.parseArgument(
new String[] {
"-e", entitiesPath,
"-w", workingPath,
"-np", numPartitions,
"-dc", dedupConfPath
});
new SparkCreateDedupEntity(
parser,
spark
).run();
}
@Test //full deduplication workflow test
@Disabled
public void deduplicationTest() throws Exception {
//custom parameters for this test
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI()).toFile().getAbsolutePath()
));
String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/software.dump.2000.json").toURI()).toFile().getAbsolutePath();
String simRelsPath = workingPath + "/simrels";
String mergeRelsPath = workingPath + "/mergerels";
String outputPath = workingPath + "/dedup";
long before_simrels = System.currentTimeMillis();
Deduper.createSimRels(
dedupConfig,
spark,
inputPath,
simRelsPath,
true,
false
);
long simrels_time = System.currentTimeMillis() - before_simrels;
long simrels_number = spark.read().load(simRelsPath).count();
long before_mergerels = System.currentTimeMillis();
Deduper.createMergeRels(
dedupConfig,
inputPath,
mergeRelsPath,
simRelsPath,
spark
);
long mergerels_time = System.currentTimeMillis() - before_mergerels;
long mergerels_number = spark.read().load(mergeRelsPath).count();
long before_dedupentity = System.currentTimeMillis();
Deduper.createDedupEntity(
dedupConfig,
mergeRelsPath,
inputPath,
spark,
outputPath
);
long dedupentity_time = System.currentTimeMillis() - before_dedupentity;
long dedupentity_number = context.textFile(outputPath).count();
System.out.println("Number of simrels : " + simrels_number);
System.out.println("Number of mergerels : " + mergerels_number);
System.out.println("Number of dedupentities : " + dedupentity_number);
System.out.println("Total time for simrels creation : " + simrels_time);
System.out.println("Total time for mergerels creation : " + mergerels_time);
System.out.println("Total time for dedupentity creation : " + dedupentity_time);
// FileUtils.deleteDirectory(new File(workingPath));
}
@Test //test the match between two JSON
@Disabled
public void matchTest() throws Exception {
String json1 = "{\"author\":[{\"affiliation\":[],\"fullname\":\"Hanayik, Taylor\",\"name\":\"Taylor\",\"pid\":[],\"rank\":1,\"surname\":\"Hanayik\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Drake, Christopher\",\"name\":\"Christopher\",\"pid\":[],\"rank\":2,\"surname\":\"Drake\"},{\"affiliation\":[],\"fullname\":\"Rorden, Chris\",\"name\":\"Chris\",\"pid\":[],\"rank\":3,\"surname\":\"Rorden\"},{\"affiliation\":[],\"fullname\":\"Hardcastle, Nell\",\"name\":\"Nell\",\"pid\":[],\"rank\":4,\"surname\":\"Hardcastle\"},{\"affiliation\":[],\"fullname\":\"Androulakis, Anthony\",\"name\":\"Anthony\",\"pid\":[],\"rank\":5,\"surname\":\"Androulakis\"}],\"bestaccessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":[{\"key\":\"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254\",\"value\":\"Datacite\"}],\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"dateofacceptance\":{\"value\":\"2022-03-02\"},\"dateofcollection\":\"2022-03-02T11:25:20+0000\",\"dateoftransformation\":\"2022-03-02T11:25:20+0000\",\"description\":[{\"value\":\"a WebGL2 based NIFTI volume viewer\"}],\"id\":\"50|doi_________::b596bfb411bbc62b902aedb11d0088d8\",\"instance\":[{\"accessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":{\"key\":\"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254\",\"value\":\"Datacite\"},\"dateofacceptance\":{\"value\":\"2022-03-02\"},\"hostedby\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"instancetype\":{\"classid\":\"0029\",\"classname\":\"Software\",\"schemeid\":\"dnet:publication_resource\",\"schemename\":\"dnet:publication_resource\"},\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786269\"}],\"refereed\":{\"classid\":\"0000\",\"classname\":\"Unknown\",\"schemeid\":\"dnet:review_levels\",\"schemename\":\"dnet:review_levels\"},\"url\":[\"https://dx.doi.org/10.5281/zenodo.5786269\"]}],\"language\":{\"classid\":\"und\",\"classname\":\"Undetermined\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"originalId\":[\"10.5281/zenodo.5786269\"],\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786269\"}],\"publisher\":{\"value\":\"Zenodo\"},\"relevantdate\":[{\"qualifier\":{\"classid\":\"issued\",\"classname\":\"issued\",\"schemeid\":\"dnet:dataCite_date\",\"schemename\":\"dnet:dataCite_date\"},\"value\":\"2022-03-02\"}],\"resourcetype\":{\"classid\":\"UNKNOWN\",\"classname\":\"Unknown\",\"schemeid\":\"dnet:dataCite_resource\",\"schemename\":\"dnet:dataCite_resource\"},\"resulttype\":{\"classid\":\"software\",\"classname\":\"software\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"subject\":[],\"title\":[{\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"value\":\"niivue/niivue: 0.21.1\"}]}";
String json2 = "{\"author\":[{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Chris Rorden\",\"name\":\"\",\"pid\":[],\"rank\":1,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of Oxford\"}],\"fullname\":\"Taylor Hanayik\",\"name\":\"\",\"pid\":[],\"rank\":2,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Christopher Drake\",\"name\":\"\",\"pid\":[],\"rank\":3,\"surname\":\"\"},{\"affiliation\":[],\"fullname\":\"Nell Hardcastle\",\"name\":\"\",\"pid\":[],\"rank\":4,\"surname\":\"\"},{\"affiliation\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"University of South Carolina\"}],\"fullname\":\"Roger Newman-Norlund\",\"name\":\"\",\"pid\":[],\"rank\":5,\"surname\":\"\"}],\"bestaccessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"collectedfrom\":[{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"}],\"context\":[],\"contributor\":[],\"country\":[],\"coverage\":[],\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"dateofacceptance\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"2021-12-16\"},\"dateofcollection\":\"2022-11-11T00:19:05+0000\",\"dateoftransformation\":\"2022-11-11T07:32:42.689Z\",\"description\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"<p>a WebGL2 based NIFTI volume viewer</p>\"}],\"documentationUrl\":[],\"eoscifguidelines\":[],\"externalReference\":[],\"extraInfo\":[],\"format\":[],\"fulltext\":[],\"id\":\"50|doi_________::59a7f397515febc2c7017fd2f866a777\",\"instance\":[{\"accessright\":{\"classid\":\"OPEN\",\"classname\":\"Open Access\",\"schemeid\":\"dnet:access_modes\",\"schemename\":\"dnet:access_modes\"},\"alternateIdentifier\":[],\"collectedfrom\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"dateofacceptance\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"2021-12-16\"},\"hostedby\":{\"key\":\"10|opendoar____::358aee4cc897452c00244351e4d91f69\",\"value\":\"ZENODO\"},\"instancetype\":{\"classid\":\"0029\",\"classname\":\"Software\",\"schemeid\":\"dnet:publication_resource\",\"schemename\":\"dnet:publication_resource\"},\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786270\"}],\"refereed\":{\"classid\":\"0000\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:review_levels\",\"schemename\":\"dnet:review_levels\"},\"url\":[\"http://dx.doi.org/10.5281/zenodo.5786270\",\"https://doi.org/10.5281/zenodo.5786270\"]}],\"language\":{\"classid\":\"UNKNOWN\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:languages\",\"schemename\":\"dnet:languages\"},\"lastupdatetimestamp\":1668556279928,\"license\":[],\"originalId\":[\"oai:zenodo.org:5786270\",\"50|od______2659::c8a3ff31e2ff537ebac22cdc4d7c64b0\"],\"pid\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.5281/zenodo.5786270\"}],\"programmingLanguage\":{\"classid\":\"\",\"classname\":\"\",\"schemeid\":\"\",\"schemename\":\"\"},\"publisher\":{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"value\":\"Zenodo\"},\"relevantdate\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"issued\",\"classname\":\"issued\",\"schemeid\":\"dnet:dataCite_date\",\"schemename\":\"dnet:dataCite_date\"},\"value\":\"2021-12-16\"}],\"resourcetype\":{\"classid\":\"UNKNOWN\",\"classname\":\"UNKNOWN\",\"schemeid\":\"dnet:dataCite_resource\",\"schemename\":\"dnet:dataCite_resource\"},\"resulttype\":{\"classid\":\"software\",\"classname\":\"software\",\"schemeid\":\"dnet:result_typologies\",\"schemename\":\"dnet:result_typologies\"},\"source\":[],\"subject\":[],\"title\":[{\"dataInfo\":{\"deletedbyinference\":false,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"main title\",\"classname\":\"main title\",\"schemeid\":\"dnet:dataCite_title\",\"schemename\":\"dnet:dataCite_title\"},\"value\":\"niivue/niivue: 0.13.0\"}]}";
DedupConfig config = DedupConfig.load(readFileFromHDFS(Paths
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI())
.toFile()
.getAbsolutePath()));
MapDocument a = MapDocumentUtil.asMapDocumentWithJPath(config, json1);
MapDocument b = MapDocumentUtil.asMapDocumentWithJPath(config, json2);
boolean result = new TreeProcessor(config).compare(a,b);
System.out.println("Tree Processor Result = " + result);
}
@Test //test the dedup of a group of JSON
@Disabled
public void dedupTest() throws Exception {
final String entitiesPath = Paths
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/software.to.fix.json").toURI())
.toFile()
.getAbsolutePath();
DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(Paths
.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/sw.tree.conf.json").toURI())
.toFile()
.getAbsolutePath()));
JavaPairRDD<String, MapDocument> mapDocuments = context
.textFile(entitiesPath)
.mapToPair(
(PairFunction<String, String, MapDocument>) s -> {
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
return new Tuple2<>(d.getIdentifier(), d);
})
.reduceByKey((a,b) -> a);
// create blocks for deduplication
JavaPairRDD<String, Block> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf);
for (Tuple2<String, Block> b : blocks.collect()) {
System.out.println("*******GROUPS********");
System.out.println("key = " + b._1());
System.out.println("elements = " + b._2().elements());
System.out.println("items = " + b._2().getDocuments().stream().map(d -> d.getIdentifier()).collect(Collectors.joining(",")));
System.out.println("*********************");
}
// create relations by comparing only elements in the same group
JavaRDD<Relation> relations = Deduper.computeRelations(context, blocks, dedupConf, true, false);
for (Relation r: relations.collect()) {
System.out.println("*******RELATIONS*******");
System.out.println("source = " + r.getSource());
System.out.println("target = " + r.getTarget());
System.out.println("***********************");
}
//vertexes
List<String> vertexes = mapDocuments.map(doc -> doc._1()).collect();
//edges
List<Tuple2<String, String>> edges = new ArrayList<>();
relations.collect().stream().forEach(r -> edges.add(new Tuple2(r.getSource(), r.getTarget())));
showGraph(vertexes, edges, mapDocuments);
cleanup();
}
public void showGraph(List<String> vertexes, List<Tuple2<String, String>> edges, JavaPairRDD<String, MapDocument> mapDocuments) {
try {
prepareGraphParams(
vertexes,
edges,
"/tmp/graph.html", Paths.get(DedupLocalTest.class.getResource("/graph_visualization_tool/graph_template.html").toURI()).toFile().getAbsolutePath(),
mapDocuments.collectAsMap());
Desktop.getDesktop().browse(new File("/tmp/graph.html").toURI());
} catch (Exception e) {
e.printStackTrace();
}
}
public int nodeDegree(String id, List<Tuple2<String, String>> edges) {
return (int) edges.stream().map(e -> e._1()).filter(s -> s.equalsIgnoreCase(id)).count();
}
public int minDegree(List<String> vertexes, List<Tuple2<String, String>> edges) {
int minDegree = 100;
for (String vertex: vertexes) {
int deg = nodeDegree(vertex, edges);
if (deg < minDegree)
minDegree = deg;
}
return minDegree;
}
@Test
@Disabled
public void asMapDocument() throws Exception {
final String json = "{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": true, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"resourcetype\": {\"classid\": \"0001\", \"classname\": \"0001\", \"schemeid\": \"dnet:dataCite_resource\", \"schemename\": \"dnet:dataCite_resource\"}, \"pid\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"handle\", \"classname\": \"Handle\", \"schemeid\": \"dnet:pid_types\", \"schemename\": \"dnet:pid_types\"}, \"value\": \"11370/8fb3e34b-47c4-4e87-a675-c889db060e19\"}], \"contributor\": [], \"bestaccessright\": {\"classid\": \"CLOSED\", \"classname\": \"Closed Access\", \"schemeid\": \"dnet:access_modes\", \"schemename\": \"dnet:access_modes\"}, \"relevantdate\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"issued\", \"classname\": \"issued\", \"schemeid\": \"dnet:dataCite_date\", \"schemename\": \"dnet:dataCite_date\"}, \"value\": \"2018-01-01\"}], \"collectedfrom\": [{\"dataInfo\": null, \"key\": \"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f\", \"value\": \"DANS (Data Archiving and Networked Services)\"}], \"id\": \"50|DansKnawCris::4ad8a5701b7d06b851966e1b323a2a95\", \"subject\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"OF-THE-ART\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"SCHOOL LEADERSHIP\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"DYNAMIC-MODEL\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"IMPLEMENTATION\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"OUTCOMES\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"keyword\", \"classname\": \"keyword\", \"schemeid\": \"dnet:subject_classification_typologies\", \"schemename\": \"dnet:subject_classification_typologies\"}, \"value\": \"To be checked by Faculty\"}], \"embargoenddate\": null, \"lastupdatetimestamp\": 1645007805304, \"author\": [{\"surname\": \"Kyriakides\", \"name\": \"Leonidas\", \"pid\": [], \"rank\": 1, \"affiliation\": [], \"fullname\": \"Kyriakides, Leonidas\"}, {\"surname\": \"Georgiou\", \"name\": \"Maria P.\", \"pid\": [], \"rank\": 2, \"affiliation\": [], \"fullname\": \"Georgiou, Maria P.\"}, {\"surname\": \"Creemers\", \"name\": \"Bert P. M.\", \"pid\": [], \"rank\": 3, \"affiliation\": [], \"fullname\": \"Creemers, Bert P. M.\"}, {\"surname\": \"Panayiotou\", \"name\": \"Anastasia\", \"pid\": [], \"rank\": 4, \"affiliation\": [], \"fullname\": \"Panayiotou, Anastasia\"}, {\"surname\": \"Reynolds\", \"name\": \"David\", \"pid\": [], \"rank\": 5, \"affiliation\": [], \"fullname\": \"Reynolds, David\"}], \"instance\": [{\"refereed\": {\"classid\": \"0000\", \"classname\": \"UNKNOWN\", \"schemeid\": \"dnet:review_levels\", \"schemename\": \"dnet:review_levels\"}, \"hostedby\": {\"dataInfo\": null, \"key\": \"10|issn___print::2763d4e6e2e870a7ffd4bf8863c6bbff\", \"value\": \"School Effectiveness and School Improvement\"}, \"accessright\": {\"classid\": \"CLOSED\", \"classname\": \"Closed Access\", \"schemeid\": \"dnet:access_modes\", \"schemename\": \"dnet:access_modes\", \"openAccessRoute\": null}, \"license\": null, \"url\": [\"\", \"http://dx.doi.org/10.1080/09243453.2017.1398761\"], \"measures\": null, \"pid\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"handle\", \"classname\": \"Handle\", \"schemeid\": \"dnet:pid_types\", \"schemename\": \"dnet:pid_types\"}, \"value\": \"11370/8fb3e34b-47c4-4e87-a675-c889db060e19\"}], \"distributionlocation\": null, \"processingchargecurrency\": null, \"alternateIdentifier\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"urn\", \"classname\": \"urn\", \"schemeid\": \"dnet:pid_types\", \"schemename\": \"dnet:pid_types\"}, \"value\": \"urn:nbn:nl:ui:11-8fb3e34b-47c4-4e87-a675-c889db060e19\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"doi\", \"classname\": \"Digital Object Identifier\", \"schemeid\": \"dnet:pid_types\", \"schemename\": \"dnet:pid_types\"}, \"value\": \"10.1080/09243453.2017.1398761\"}], \"dateofacceptance\": {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"value\": \"2018-01-01\"}, \"collectedfrom\": {\"dataInfo\": null, \"key\": \"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f\", \"value\": \"DANS (Data Archiving and Networked Services)\"}, \"processingchargeamount\": null, \"instancetype\": {\"classid\": \"0001\", \"classname\": \"Article\", \"schemeid\": \"dnet:publication_resource\", \"schemename\": \"dnet:publication_resource\"}}], \"resulttype\": {\"classid\": \"publication\", \"classname\": \"publication\", \"schemeid\": \"dnet:result_typologies\", \"schemename\": \"dnet:result_typologies\"}, \"dateofcollection\": \"2020-11-12T23:04:09.482Z\", \"fulltext\": [], \"dateoftransformation\": \"2020-11-13T01:33:00.881Z\", \"description\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"value\": \"This paper investigates the impact of national policies for improving teaching and the school learning environment (SLE) on student achievement. In each participating country (i.e., Belgium/Flanders, Cyprus, Germany, Greece, Ireland, and Slovenia), a sample of at least 50 schools was drawn and tests in mathematics and science were administered to all Grade 4 students (N = 10,742) at the beginning and end of school year 2010\\u20132011. National policies were measured through (a) content analysis of policy documents, (b) interviews with policymakers, and (c) head-teacher questionnaires. Multilevel analyses revealed that most aspects of national policies for teaching and SLE were associated with student achievement in each subject irrespective of the source of data used to measure them. Implications are, finally, drawn.\"}], \"format\": [], \"processingchargecurrency\": null, \"journal\": {\"issnPrinted\": \"0924-3453\", \"conferencedate\": null, \"vol\": \"29\", \"conferenceplace\": null, \"name\": \"School Effectiveness and School Improvement\", \"iss\": \"2\", \"sp\": \"171\", \"edition\": \"\", \"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"issnOnline\": \"\", \"ep\": \"203\", \"issnLinking\": \"\"}, \"measures\": null, \"dateofacceptance\": {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"value\": \"2018-01-01\"}, \"coverage\": [], \"processingchargeamount\": null, \"externalReference\": [], \"publisher\": null, \"language\": {\"classid\": \"eng\", \"classname\": \"English\", \"schemeid\": \"dnet:languages\", \"schemename\": \"dnet:languages\"}, \"oaiprovenance\": {\"originDescription\": {\"metadataNamespace\": \"\", \"harvestDate\": \"2020-11-12T23:04:09.482Z\", \"baseURL\": \"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif\", \"datestamp\": \"2020-11-12T00:51:43Z\", \"altered\": true, \"identifier\": \"oai:services.nod.dans.knaw.nl:Publications/rug:oai:pure.rug.nl:publications/8fb3e34b-47c4-4e87-a675-c889db060e19\"}}, \"country\": [], \"extraInfo\": [], \"originalId\": [\"oai:services.nod.dans.knaw.nl:Publications/rug:oai:pure.rug.nl:publications/8fb3e34b-47c4-4e87-a675-c889db060e19\", \"50|DansKnawCris::4ad8a5701b7d06b851966e1b323a2a95\"], \"source\": [], \"context\": [], \"title\": [{\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"main title\", \"classname\": \"main title\", \"schemeid\": \"dnet:dataCite_title\", \"schemename\": \"dnet:dataCite_title\"}, \"value\": \"The impact of national educational policies on student achievement\"}, {\"dataInfo\": {\"provenanceaction\": {\"classid\": \"sysimport:crosswalk:datasetarchive\", \"classname\": \"Harvested\", \"schemeid\": \"dnet:provenanceActions\", \"schemename\": \"dnet:provenanceActions\"}, \"deletedbyinference\": false, \"inferred\": false, \"inferenceprovenance\": \"\", \"invisible\": false, \"trust\": \"0.9\"}, \"qualifier\": {\"classid\": \"subtitle\", \"classname\": \"subtitle\", \"schemeid\": \"dnet:dataCite_title\", \"schemename\": \"dnet:dataCite_title\"}, \"value\": \"a European study\"}]}";
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.new.tree.conf.json").toURI()).toFile().getAbsolutePath()
));
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, json);
for(String field: mapDocument.getFieldMap().keySet()) {
System.out.println(field + ": " + mapDocument.getFieldMap().get(field).stringValue());
}
}
@Test
@Disabled
public void noMatchTest() throws Exception {
//custom parameters for this test
DedupConfig dedupConfig = DedupConfig.load(readFileFromHDFS(
Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/config/pub.new.tree.conf.json").toURI()).toFile().getAbsolutePath()
));
String inputPath = Paths.get(DedupLocalTest.class.getResource("/eu/dnetlib/pace/examples/publications.dump.1000.json").toURI()).toFile().getAbsolutePath();
String simRelsPath = workingPath + "/simrels";
Deduper.createSimRels(
dedupConfig,
spark,
inputPath,
simRelsPath,
true,
true
);
Dataset<Relation> noMatches = spark.read().load(simRelsPath).as(Encoders.bean(Relation.class));
System.out.println("noMatches = " + noMatches.count());
noMatches.foreach((ForeachFunction<Relation>) r -> System.out.println(r.getSource() + " " + r.getTarget()));
FileUtils.deleteDirectory(new File(workingPath));
}
}