enrichment steps #38
|
@ -3,11 +3,16 @@ package eu.dnetlib.dhp.actionmanager.project;
|
|||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -28,6 +33,15 @@ import eu.dnetlib.dhp.schema.oaf.Programme;
|
|||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.spark.rdd.SequenceFileRDDFunctions;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import scala.Function1;
|
||||
import scala.Tuple2;
|
||||
import scala.runtime.BoxedUnit;
|
||||
|
||||
public class SparkAtomicActionJob {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionJob.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
@ -61,6 +75,8 @@ public class SparkAtomicActionJob {
|
|||
final String programmePath = parser.get("programmePath");
|
||||
log.info("programmePath {}: ", programmePath);
|
||||
|
||||
final String nameNode = parser.get("hdfsNameNode");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
|
@ -72,7 +88,8 @@ public class SparkAtomicActionJob {
|
|||
spark,
|
||||
projectPath,
|
||||
programmePath,
|
||||
outputPath);
|
||||
outputPath,
|
||||
nameNode);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -82,7 +99,8 @@ public class SparkAtomicActionJob {
|
|||
|
||||
private static void getAtomicActions(SparkSession spark, String projectPatH,
|
||||
String programmePath,
|
||||
String outputPath) {
|
||||
String outputPath,
|
||||
String nameNode) throws Exception{
|
||||
|
||||
Dataset<CSVProject> project = readPath(spark, projectPatH, CSVProject.class);
|
||||
Dataset<CSVProgramme> programme = readPath(spark, programmePath, CSVProgramme.class);
|
||||
|
@ -103,17 +121,16 @@ public class SparkAtomicActionJob {
|
|||
pm.setCode(csvProject.getProgramme());
|
||||
pm.setDescription(csvProgramme.get().getShortTitle());
|
||||
p.setProgramme(Arrays.asList(pm));
|
||||
return p;
|
||||
return new AtomicAction<>(Project.class, p);
|
||||
}
|
||||
|
||||
return null;
|
||||
}, Encoders.bean(Project.class))
|
||||
.filter(p -> !(p == null))
|
||||
// .map(p -> new AtomicAction<>(Project.class, p), Encoders.bean(AtomicAction.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
}, Encoders.bean(AtomicAction.class))
|
||||
.filter(aa -> !(aa == null))
|
||||
.toJavaRDD()
|
||||
.mapToPair(aa->new Tuple2<>(aa.getClazz().getCanonicalName(), OBJECT_MAPPER.writeValueAsString(aa)))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, null);
|
||||
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
|
|
|
@ -24,7 +24,6 @@
|
|||
<fs>
|
||||
<delete path='${outputPath}'/>
|
||||
<mkdir path='${outputPath}'/>
|
||||
<delete path="/tmp/h2020programme"/>
|
||||
</fs>
|
||||
<ok to="get_project_file"/>
|
||||
<error to="Kill"/>
|
||||
|
|
Loading…
Reference in New Issue