forked from D-Net/dnet-hadoop
changed to discriminate whether to dump all the result types together or each one in its own archive
commit 387be43fd4 (parent c5858afb88)
@@ -8,6 +8,7 @@ import java.util.Optional;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
+import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
@@ -20,7 +21,6 @@ import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
 
 /**
  * Reads all the entities of the same type (Relation / Results) and saves them in the same folder
- *
  */
public class SparkCollectAndSave implements Serializable {
 
@@ -48,6 +48,11 @@ public class SparkCollectAndSave implements Serializable {
         final String outputPath = parser.get("outputPath");
         log.info("outputPath: {}", outputPath);
 
+        final Boolean aggregateResult = Optional
+            .ofNullable(parser.get("resultAggregation"))
+            .map(Boolean::valueOf)
+            .orElse(Boolean.TRUE);
+
         SparkConf conf = new SparkConf();
 
         runWithSparkSession(
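
The new resultAggregation parameter is read with the Optional pattern above: a missing or null argument falls back to Boolean.TRUE, so the single aggregated result dump stays the default behaviour. A minimal standalone sketch of that defaulting logic (the class name, method name, and main are illustrative; only the Optional chain comes from the hunk above):

    import java.util.Optional;

    public class FlagDefaultDemo {
        // Same chain as in the hunk above: null (parameter not supplied)
        // falls back to Boolean.TRUE, i.e. aggregate by default. Note that
        // Boolean.valueOf treats anything other than "true" as false.
        static Boolean parseAggregation(String raw) {
            return Optional
                .ofNullable(raw)
                .map(Boolean::valueOf)
                .orElse(Boolean.TRUE);
        }

        public static void main(String[] args) {
            System.out.println(parseAggregation(null));    // true  -> one archive for all result types
            System.out.println(parseAggregation("false")); // false -> one archive per result type
        }
    }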
@@ -55,22 +60,42 @@ public class SparkCollectAndSave implements Serializable {
             isSparkSessionManaged,
             spark -> {
                 Utils.removeOutputDir(spark, outputPath + "/result");
-                run(spark, inputPath, outputPath);
+                run(spark, inputPath, outputPath, aggregateResult);
 
             });
 
     }
 
-    private static void run(SparkSession spark, String inputPath, String outputPath) {
-        Utils
-            .readPath(spark, inputPath + "/result/publication", Result.class)
-            .union(Utils.readPath(spark, inputPath + "/result/dataset", Result.class))
-            .union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class))
-            .union(Utils.readPath(spark, inputPath + "/result/software", Result.class))
-            .write()
-            .option("compression", "gzip")
-            .mode(SaveMode.Overwrite)
-            .json(outputPath + "/result");
+    private static void run(SparkSession spark, String inputPath, String outputPath, boolean aggregate) {
+        if (aggregate) {
+            Utils
+                .readPath(spark, inputPath + "/result/publication", Result.class)
+                .union(Utils.readPath(spark, inputPath + "/result/dataset", Result.class))
+                .union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class))
+                .union(Utils.readPath(spark, inputPath + "/result/software", Result.class))
+                .write()
+                .option("compression", "gzip")
+                .mode(SaveMode.Overwrite)
+                .json(outputPath + "/result");
+        } else {
+            write(
+                Utils
+                    .readPath(spark, inputPath + "/result/publication", Result.class),
+                outputPath + "/publication");
+            write(
+                Utils
+                    .readPath(spark, inputPath + "/result/dataset", Result.class),
+                outputPath + "/dataset");
+            write(
+                Utils
+                    .readPath(spark, inputPath + "/result/otherresearchproduct", Result.class),
+                outputPath + "/otheresearchproduct");
+            write(
+                Utils
+                    .readPath(spark, inputPath + "/result/software", Result.class),
+                outputPath + "/software");
+
+        }
 
         Utils
             .readPath(spark, inputPath + "/relation/publication", Relation.class)
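
The rewritten run method branches on the new flag: with aggregate = true the four result types are unioned into one gzip-compressed JSON dump under /result, otherwise each type goes to its own folder through the new write helper. A sketch of the same choice in plain Spark SQL, assuming untyped DataFrame reads in place of the project's typed Utils.readPath (class name, paths, and the loop are illustrative; the real code reads every type into the shared Result class, which is what guarantees the unioned schemas line up):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    public class DumpBranchSketch {
        static void run(SparkSession spark, String in, String out, boolean aggregate) {
            String[] types = { "publication", "dataset", "otherresearchproduct", "software" };
            if (aggregate) {
                // One dump holding every result type; union is positional,
                // so this relies on the four inputs sharing one schema.
                Dataset<Row> all = null;
                for (String t : types) {
                    Dataset<Row> part = spark.read().json(in + "/result/" + t);
                    all = all == null ? part : all.union(part);
                }
                all.write().option("compression", "gzip").mode(SaveMode.Overwrite).json(out + "/result");
            } else {
                // One dump per result type, each in its own output folder.
                for (String t : types) {
                    spark
                        .read()
                        .json(in + "/result/" + t)
                        .write()
                        .option("compression", "gzip")
                        .mode(SaveMode.Overwrite)
                        .json(out + "/" + t);
                }
            }
        }
    }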
@@ -86,4 +111,12 @@ public class SparkCollectAndSave implements Serializable {
             .json(outputPath + "/relation");
 
     }
+
+    private static void write(Dataset<Result> dataSet, String outputPath) {
+        dataSet
+            .write()
+            .option("compression", "gzip")
+            .mode(SaveMode.Overwrite)
+            .json(outputPath);
+    }
 }
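
Factoring the writer chain into write(Dataset<Result>, String) keeps the output configuration (gzip compression, SaveMode.Overwrite, JSON) in one place for the four per-type dumps. Worth noting: Utils.removeOutputDir above only clears outputPath + "/result", so in the per-type branch it is SaveMode.Overwrite that makes reruns of this step safe.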