From aeb01fa353a028650c01ee417368154d9c45c220 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 17 Mar 2020 11:57:24 +0100 Subject: [PATCH] reading from newline delimited json textfiles instead of sequence files --- .../main/java/eu/dnetlib/dhp/graph/GraphJoiner.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java index 29492cb90..d260e0551 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -226,9 +226,8 @@ public class GraphJoiner implements Serializable { * @return the JavaPairRDD indexed by entity identifier */ private JavaPairRDD readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { - return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class) - .mapToPair((PairFunction, String, TypedRow>) item -> { - final String s = item._2().toString(); + return sc.textFile(inputPath + "/" + type) + .mapToPair((PairFunction) s -> { final DocumentContext json = JsonPath.parse(s); final String id = json.read("$.id"); return new Tuple2<>(id, new TypedRow() @@ -247,9 +246,8 @@ public class GraphJoiner implements Serializable { * @return the JavaRDD containing all the relationships */ private JavaRDD readPathRelation(final JavaSparkContext sc, final String inputPath) { - return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class) - .map(item -> { - final String s = item._2().toString(); + return sc.textFile(inputPath + "/relation") + .map(s -> { final DocumentContext json = JsonPath.parse(s); return new TypedRow() .setSourceId(json.read("$.source"))