2021-02-01 13:56:05 +01:00
|
|
|
|
|
|
|
package eu.dnetlib.dhp.aggregation.common;
|
|
|
|
|
|
|
|
import java.io.BufferedOutputStream;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.nio.charset.StandardCharsets;
|
|
|
|
|
|
|
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
|
|
|
import org.apache.hadoop.fs.FileSystem;
|
|
|
|
import org.apache.hadoop.fs.Path;
|
2021-02-01 19:29:10 +01:00
|
|
|
import org.apache.spark.sql.Dataset;
|
|
|
|
import org.apache.spark.sql.SaveMode;
|
2021-02-01 13:56:05 +01:00
|
|
|
import org.apache.spark.sql.SparkSession;
|
2021-02-01 19:29:10 +01:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
|
|
|
import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob;
|
|
|
|
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
2021-02-01 13:56:05 +01:00
|
|
|
|
|
|
|
public class AggregationUtility {
|
|
|
|
|
2021-02-01 19:29:10 +01:00
|
|
|
private static final Logger log = LoggerFactory.getLogger(AggregationUtility.class);
|
|
|
|
|
2021-02-01 13:56:05 +01:00
|
|
|
public static void writeTotalSizeOnHDFS(final SparkSession spark, final Long total, final String path)
|
|
|
|
throws IOException {
|
|
|
|
|
2021-02-01 19:29:10 +01:00
|
|
|
log.info("writing size ({}) info file {}", total, path);
|
|
|
|
try (FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
|
|
|
|
BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
|
|
|
|
os.write(total.toString().getBytes(StandardCharsets.UTF_8));
|
|
|
|
os.flush();
|
|
|
|
}
|
2021-02-01 13:56:05 +01:00
|
|
|
|
2021-02-01 19:29:10 +01:00
|
|
|
}
|
2021-02-01 13:56:05 +01:00
|
|
|
|
2021-02-01 19:29:10 +01:00
|
|
|
public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
|
|
|
|
log.info("saving dataset in: {}", targetPath);
|
|
|
|
mdstore
|
|
|
|
.write()
|
|
|
|
.mode(SaveMode.Overwrite)
|
|
|
|
.format("parquet")
|
|
|
|
.save(targetPath);
|
2021-02-01 13:56:05 +01:00
|
|
|
}
|
2021-02-01 19:29:10 +01:00
|
|
|
|
2021-02-01 13:56:05 +01:00
|
|
|
}
|