added RDD based adjacency list creation procedure

Claudio Atzori 2020-05-27 12:38:12 +02:00
parent f057dcdf65
commit 8047d16dd9
1 changed file with 39 additions and 1 deletion


@@ -9,14 +9,20 @@ import java.util.Optional;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function2;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import com.google.common.collect.Lists;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
@@ -83,7 +89,7 @@ public class AdjacencyListBuilderJob {
 			isSparkSessionManaged,
 			spark -> {
 				removeOutputDir(spark, outputPath);
-				createAdjacencyLists(spark, inputPath, outputPath);
+				createAdjacencyListsRDD(spark, inputPath, outputPath);
 			});
 	}
@@ -118,6 +124,38 @@ public class AdjacencyListBuilderJob {
 			.parquet(outputPath);
 	}
+
+	private static void createAdjacencyListsRDD(
+			SparkSession spark, String inputPath, String outputPath) {
+		log.info("Reading joined entities from: {}", inputPath);
+		RDD<JoinedEntity> joinedEntities = spark
+			.read()
+			.load(inputPath)
+			.as(Encoders.bean(EntityRelEntity.class))
+			.javaRDD()
+			.mapToPair(re -> {
+				JoinedEntity je = new JoinedEntity();
+				je.setEntity(re.getEntity());
+				je.setLinks(Lists.newArrayList());
+				if (re.getRelation() != null && re.getTarget() != null) {
+					je.getLinks().add(new Tuple2(re.getRelation(), re.getTarget()));
+				}
+				return new scala.Tuple2<>(re.getEntity().getId(), je);
+			})
+			.reduceByKey((je1, je2) -> {
+				je1.getLinks().addAll(je2.getLinks());
+				return je1;
+			})
+			.map(t -> t._2())
+			.rdd();
+
+		spark
+			.createDataset(joinedEntities, Encoders.bean(JoinedEntity.class))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.parquet(outputPath);
+	}
+
 	private static void removeOutputDir(SparkSession spark, String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}