package eu.dnetlib.doiboost.orcid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class SparkPartitionLambdaFile {

    public static void main(String[] args) throws Exception {
        Logger logger = LoggerFactory.getLogger(SparkPartitionLambdaFile.class);

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    SparkOrcidGenerateAuthors.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
        parser.parseArgument(args);

        // Let the caller decide whether the Spark session lifecycle is managed
        // externally; default to a managed session.
        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        final String workingPath = parser.get("workingPath");

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
                // Read the ORCID lambda (last-modified) file and repartition it into
                // 20 slices so downstream jobs can process the parts in parallel.
                JavaRDD<String> lambdaFileRDD = sc.textFile(workingPath + "last_modified.csv");
                lambdaFileRDD
                    .repartition(20)
                    .saveAsTextFile(workingPath.concat("lamdafiles"));
            });
    }

}
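
// A possible invocation of this job, as a sketch. The jar name and option spellings
// below are assumptions (the authoritative option definitions live in
// gen_orcid_authors_parameters.json); they only illustrate the two parameters the
// class actually reads, "workingPath" and "isSparkSessionManaged":
//
//   spark-submit \
//     --class eu.dnetlib.doiboost.orcid.SparkPartitionLambdaFile \
//     dhp-doiboost.jar \
//     --workingPath /data/orcid/ \
//     --isSparkSessionManaged true
//
// Note that workingPath is concatenated directly with "last_modified.csv" and
// "lamdafiles", so it should end with a trailing slash.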