forked from D-Net/dnet-hadoop
GroupEntitiesSparkJob must read all graph paths except relations
This commit is contained in:
parent
76363a8512
commit
13eae4b31e
|
@ -90,7 +90,7 @@ public class GroupEntitiesSparkJob {
|
|||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
spark
|
||||
.read()
|
||||
.textFile(toSeq(listPaths(inputPath, sc)))
|
||||
.textFile(toSeq(listEntityPaths(inputPath, sc)))
|
||||
.map((MapFunction<String, OafEntity>) s -> parseOaf(s), Encoders.kryo(OafEntity.class))
|
||||
.filter((FilterFunction<OafEntity>) e -> StringUtils.isNotBlank(ModelSupport.idFn().apply(e)))
|
||||
.groupByKey((MapFunction<OafEntity, String>) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING())
|
||||
|
@ -191,11 +191,11 @@ public class GroupEntitiesSparkJob {
|
|||
}
|
||||
}
|
||||
|
||||
private static List<String> listPaths(String inputPath, JavaSparkContext sc) {
|
||||
private static List<String> listEntityPaths(String inputPath, JavaSparkContext sc) {
|
||||
return HdfsSupport
|
||||
.listFiles(inputPath, sc.hadoopConfiguration())
|
||||
.stream()
|
||||
.filter(f -> !f.equals("relation"))
|
||||
.filter(f -> !f.toLowerCase().contains("relation"))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue