forked from D-Net/dnet-hadoop
GroupEntitiesSparkJob must read all graph paths except relations
This commit is contained in:
parent
76363a8512
commit
13eae4b31e
|
@ -90,7 +90,7 @@ public class GroupEntitiesSparkJob {
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
spark
|
spark
|
||||||
.read()
|
.read()
|
||||||
.textFile(toSeq(listPaths(inputPath, sc)))
|
.textFile(toSeq(listEntityPaths(inputPath, sc)))
|
||||||
.map((MapFunction<String, OafEntity>) s -> parseOaf(s), Encoders.kryo(OafEntity.class))
|
.map((MapFunction<String, OafEntity>) s -> parseOaf(s), Encoders.kryo(OafEntity.class))
|
||||||
.filter((FilterFunction<OafEntity>) e -> StringUtils.isNotBlank(ModelSupport.idFn().apply(e)))
|
.filter((FilterFunction<OafEntity>) e -> StringUtils.isNotBlank(ModelSupport.idFn().apply(e)))
|
||||||
.groupByKey((MapFunction<OafEntity, String>) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING())
|
.groupByKey((MapFunction<OafEntity, String>) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING())
|
||||||
|
@ -191,11 +191,11 @@ public class GroupEntitiesSparkJob {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static List<String> listPaths(String inputPath, JavaSparkContext sc) {
|
private static List<String> listEntityPaths(String inputPath, JavaSparkContext sc) {
|
||||||
return HdfsSupport
|
return HdfsSupport
|
||||||
.listFiles(inputPath, sc.hadoopConfiguration())
|
.listFiles(inputPath, sc.hadoopConfiguration())
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.equals("relation"))
|
.filter(f -> !f.toLowerCase().contains("relation"))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue