package eu.dnetlib.dhp.oa.graph.dump.community;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.Optional;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.model.community.CommunityResult;
import eu.dnetlib.dhp.oa.model.community.Context;

/**
 * Splits the dumped results according to the research community - research initiative/infrastructure they are
 * related to. The information about the community is found in the "context.id" element of the result. Since the
 * contexts found in a result can be associated not only with communities, a community map is provided to guide the
 * splitting process. Note the repartition(1) just before writing the results related to a community: this is a
 * choice due to uploading constraints (just one file for each community). As soon as a better solution is in place,
 * remove the repartition.
 */
public class CommunitySplit implements Serializable {

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath) {
		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
				execSplit(spark, inputPath, outputPath, communityMap);
			});
	}

	private static void execSplit(SparkSession spark, String inputPath, String outputPath,
		CommunityMap communities) {

		// union of the four result types produced by the dump
		Dataset<CommunityResult> result = Utils
			.readPath(spark, inputPath + "/publication", CommunityResult.class)
			.union(Utils.readPath(spark, inputPath + "/dataset", CommunityResult.class))
			.union(Utils.readPath(spark, inputPath + "/orp", CommunityResult.class))
			.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));

		communities
			.keySet()
			.stream()
			.parallel()
			.forEach(c -> result
				// keep only the results that declare the community code among their contexts
				.filter(
					(FilterFunction<CommunityResult>) r -> Optional.ofNullable(r.getContext()).isPresent()
						&& r.getContext().stream().anyMatch((Context con) -> c.equals(con.getCode())))
				// serialize each result as one JSON line
				.map(
					(MapFunction<CommunityResult, String>) cr -> OBJECT_MAPPER.writeValueAsString(cr),
					Encoders.STRING())
				// one file per community because of the uploading constraints (see class javadoc)
				.repartition(1)
				.write()
				.option("compression", "gzip")
				.mode(SaveMode.Overwrite)
				.text(outputPath + "/" + c.replace(" ", "_")));
	}
}
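
/*
 * Minimal invocation sketch. The paths below are hypothetical, and the snippet assumes the Utils
 * helpers and the CommunityMap class (a Map keyed by community code, as suggested by the keySet()
 * usage above) from this module:
 *
 *   new CommunitySplit()
 *       .run(
 *           Boolean.TRUE,            // let run() create and close the SparkSession
 *           "/tmp/dump/results",     // hypothetical input with publication/, dataset/, orp/, software/
 *           "/tmp/dump/communities", // hypothetical output: one gzipped text folder per community code
 *           "/tmp/communityMap");    // hypothetical path to the serialized community map
 *
 * Each community code c ends up under outputPath + "/" + c.replace(" ", "_"), holding the JSON
 * serialization of the CommunityResult records associated with that community.
 */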