reading from newline delimited json textfiles instead of sequence files

Claudio Atzori 2020-03-17 11:57:24 +01:00
parent af835f2f98
commit aeb01fa353
1 changed file with 4 additions and 6 deletions


@@ -226,9 +226,8 @@ public class GraphJoiner implements Serializable {
	 * @return the JavaPairRDD<String, TypedRow> indexed by entity identifier
	 */
	private JavaPairRDD<String, TypedRow> readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) {
-		return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class)
-				.mapToPair((PairFunction<Tuple2<Text, Text>, String, TypedRow>) item -> {
-					final String s = item._2().toString();
+		return sc.textFile(inputPath + "/" + type)
+				.mapToPair((PairFunction<String, String, TypedRow>) s -> {
					final DocumentContext json = JsonPath.parse(s);
					final String id = json.read("$.id");
					return new Tuple2<>(id, new TypedRow()
@@ -247,9 +246,8 @@ public class GraphJoiner implements Serializable {
	 * @return the JavaRDD<TypedRow> containing all the relationships
	 */
	private JavaRDD<TypedRow> readPathRelation(final JavaSparkContext sc, final String inputPath) {
-		return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class)
-				.map(item -> {
-					final String s = item._2().toString();
+		return sc.textFile(inputPath + "/relation")
+				.map(s -> {
					final DocumentContext json = JsonPath.parse(s);
					return new TypedRow()
							.setSourceId(json.read("$.source"))
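
For reference, a minimal self-contained sketch of the pattern this commit switches to: reading a newline-delimited JSON dump with JavaSparkContext.textFile and keying each record by its $.id field via JsonPath, instead of unwrapping Hadoop Text values from a sequence file. The class name NdjsonReadSketch, the local master, and the input path are placeholders for illustration only; the patched methods in GraphJoiner return TypedRow values as shown in the diff above.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;

import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;

import scala.Tuple2;

public class NdjsonReadSketch {

	public static void main(String[] args) {
		final SparkConf conf = new SparkConf()
				.setAppName("ndjson-read-sketch")
				.setMaster("local[*]"); // placeholder master, for local testing only

		try (JavaSparkContext sc = new JavaSparkContext(conf)) {
			// Each input line is one JSON record (newline delimited JSON),
			// so a plain textFile read replaces the previous sequenceFile read.
			final JavaPairRDD<String, String> byId = sc
					.textFile("/tmp/graph/publication") // placeholder path
					.mapToPair((PairFunction<String, String, String>) s -> {
						final DocumentContext json = JsonPath.parse(s);
						final String id = json.read("$.id");
						return new Tuple2<>(id, s);
					});

			System.out.println("records keyed by id: " + byId.count());
		}
	}
}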