diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java index 5835617fb2..ec991cddc3 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java @@ -90,7 +90,7 @@ public class GroupEntitiesSparkJob { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); spark .read() - .textFile(toSeq(listPaths(inputPath, sc))) + .textFile(toSeq(listEntityPaths(inputPath, sc))) .map((MapFunction) s -> parseOaf(s), Encoders.kryo(OafEntity.class)) .filter((FilterFunction) e -> StringUtils.isNotBlank(ModelSupport.idFn().apply(e))) .groupByKey((MapFunction) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING()) @@ -191,11 +191,11 @@ public class GroupEntitiesSparkJob { } } - private static List listPaths(String inputPath, JavaSparkContext sc) { + private static List listEntityPaths(String inputPath, JavaSparkContext sc) { return HdfsSupport .listFiles(inputPath, sc.hadoopConfiguration()) .stream() - .filter(f -> !f.equals("relation")) + .filter(f -> !f.toLowerCase().contains("relation")) .collect(Collectors.toList()); }