From 13eae4b31e79b6945b95d06954149d7b31463b76 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 26 Nov 2020 11:04:01 +0100 Subject: [PATCH] GroupEntitiesSparkJob must read all graph paths but relations --- .../java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java index 5835617fb..ec991cddc 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java @@ -90,7 +90,7 @@ public class GroupEntitiesSparkJob { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); spark .read() - .textFile(toSeq(listPaths(inputPath, sc))) + .textFile(toSeq(listEntityPaths(inputPath, sc))) .map((MapFunction) s -> parseOaf(s), Encoders.kryo(OafEntity.class)) .filter((FilterFunction) e -> StringUtils.isNotBlank(ModelSupport.idFn().apply(e))) .groupByKey((MapFunction) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING()) @@ -191,11 +191,11 @@ public class GroupEntitiesSparkJob { } } - private static List listPaths(String inputPath, JavaSparkContext sc) { + private static List listEntityPaths(String inputPath, JavaSparkContext sc) { return HdfsSupport .listFiles(inputPath, sc.hadoopConfiguration()) .stream() - .filter(f -> !f.equals("relation")) + .filter(f -> !f.toLowerCase().contains("relation")) .collect(Collectors.toList()); }