forked from D-Net/dnet-hadoop
[dhp-actionmanager] partitioning spark job added
This commit is contained in:
parent
f9f7350bb9
commit
e21bb89dbd
|
@ -0,0 +1,120 @@
|
|||
package eu.dnetlib.dhp.actionmanager.partition;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.types.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.spark.sql.functions.*;
|
||||
|
||||
/**
|
||||
* Partitions given set of action sets by payload type.
|
||||
*/
|
||||
public class PartitionActionSetsByPayloadTypeJob {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class);
|
||||
|
||||
private static final StructType KV_SCHEMA = StructType$.MODULE$.apply(
|
||||
Arrays.asList(
|
||||
StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()),
|
||||
StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty())
|
||||
));
|
||||
|
||||
private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$.apply(
|
||||
Arrays.asList(
|
||||
StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()),
|
||||
StructField$.MODULE$.apply("payload", DataTypes.StringType, false, Metadata.empty())
|
||||
));
|
||||
|
||||
private static final String INPUT_ACTION_SET_PATHS_SEPARATOR = ",";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils.toString(
|
||||
PromoteActionPayloadForGraphTableJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/partition/partition_action_sets_by_payload_type_input_parameters.json"));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String inputActionSetPaths = parser.get("inputActionSetPaths");
|
||||
logger.info("inputActionSetPaths: {}", inputActionSetPaths);
|
||||
|
||||
String outputPath = parser.get("outputPath");
|
||||
logger.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
readAndWriteActionSetsFromPaths(spark,
|
||||
Arrays.asList(inputActionSetPaths.split(INPUT_ACTION_SET_PATHS_SEPARATOR)),
|
||||
outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
private static void readAndWriteActionSetsFromPaths(SparkSession spark,
|
||||
List<String> inputActionSetPaths,
|
||||
String outputPath) {
|
||||
inputActionSetPaths
|
||||
.forEach(inputActionSetPath -> {
|
||||
Dataset<Row> actionDS = readActionSetFromPath(spark, inputActionSetPath);
|
||||
saveActions(actionDS, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static Dataset<Row> readActionSetFromPath(SparkSession spark,
|
||||
String path) {
|
||||
logger.info("Reading actions from path: {}", path);
|
||||
|
||||
List<String> files = HdfsSupport.listFiles(path, spark.sparkContext().hadoopConfiguration());
|
||||
logger.info("Found files: {}", String.join(",", files));
|
||||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
return files
|
||||
.stream()
|
||||
.map(file -> {
|
||||
JavaRDD<Row> rdd = sc
|
||||
.sequenceFile(file, Text.class, Text.class)
|
||||
.map(x -> RowFactory.create(x._1().toString(), x._2().toString()));
|
||||
return spark.createDataFrame(rdd, KV_SCHEMA)
|
||||
.withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA))
|
||||
.select(expr("atomic_action.*"));
|
||||
})
|
||||
.reduce(spark.createDataFrame(Collections.emptyList(), ATOMIC_ACTION_SCHEMA), Dataset::union);
|
||||
}
|
||||
|
||||
private static void saveActions(Dataset<Row> actionDS,
|
||||
String path) {
|
||||
logger.info("Saving actions to path: {}", path);
|
||||
actionDS
|
||||
.write()
|
||||
.partitionBy("clazz")
|
||||
.mode(SaveMode.Append)
|
||||
.parquet(path);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "TODO",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "iasp",
|
||||
"paramLongName": "inputActionSetPaths",
|
||||
"paramDescription": "TODO",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "opasp",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "TODO",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,190 @@
|
|||
package eu.dnetlib.dhp.actionmanager.partition;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.Job;
|
||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.types.*;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import scala.Tuple2;
|
||||
import scala.collection.mutable.Seq;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.common.ThrowingSupport.rethrowAsRuntimeException;
|
||||
import static org.apache.spark.sql.functions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
|
||||
import static scala.collection.JavaConversions.mutableSeqAsJavaList;
|
||||
|
||||
public class PartitionActionSetsByPayloadTypeJobTest {
|
||||
private static final ClassLoader cl = PartitionActionSetsByPayloadTypeJobTest.class.getClassLoader();
|
||||
|
||||
private static Configuration configuration;
|
||||
private static SparkSession spark;
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$.apply(
|
||||
Arrays.asList(
|
||||
StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()),
|
||||
StructField$.MODULE$.apply("payload", DataTypes.StringType, false, Metadata.empty())
|
||||
));
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
configuration = Job.getInstance().getConfiguration();
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName());
|
||||
conf.setMaster("local");
|
||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() {
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@DisplayName("Job")
|
||||
@Nested
|
||||
class Main {
|
||||
|
||||
@Test
|
||||
public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception {
|
||||
// given
|
||||
Path inputActionSetsDir = workingDir.resolve("input").resolve("action_sets");
|
||||
Path outputDir = workingDir.resolve("output");
|
||||
|
||||
Map<String, List<String>> oafsByClassName = createActionSets(inputActionSetsDir);
|
||||
|
||||
// when
|
||||
PartitionActionSetsByPayloadTypeJob.main(new String[]{
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-inputActionSetPaths", inputActionSetsDir.toString(),
|
||||
"-outputPath", outputDir.toString()
|
||||
});
|
||||
|
||||
// then
|
||||
Files.exists(outputDir);
|
||||
|
||||
assertForOafType(outputDir, oafsByClassName, eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||
assertForOafType(outputDir, oafsByClassName, Datasource.class);
|
||||
assertForOafType(outputDir, oafsByClassName, Organization.class);
|
||||
assertForOafType(outputDir, oafsByClassName, OtherResearchProduct.class);
|
||||
assertForOafType(outputDir, oafsByClassName, Project.class);
|
||||
assertForOafType(outputDir, oafsByClassName, Publication.class);
|
||||
assertForOafType(outputDir, oafsByClassName, Result.class);
|
||||
assertForOafType(outputDir, oafsByClassName, Relation.class);
|
||||
assertForOafType(outputDir, oafsByClassName, Software.class);
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, List<String>> createActionSets(Path inputActionSetsDir) throws IOException {
|
||||
Path inputActionSetJsonDumpsDir = Paths
|
||||
.get(Objects.requireNonNull(cl.getResource("eu/dnetlib/dhp/actionmanager/partition/input/"))
|
||||
.getFile());
|
||||
|
||||
Map<String, List<String>> oafsByType = new HashMap<>();
|
||||
Files
|
||||
.list(inputActionSetJsonDumpsDir)
|
||||
.forEach(inputActionSetJsonDumpFile -> {
|
||||
String inputActionSetId = inputActionSetJsonDumpFile.getFileName().toString();
|
||||
Path inputActionSetDir = inputActionSetsDir.resolve(inputActionSetId);
|
||||
|
||||
Dataset<String> actionDS = readActionsFromJsonDump(inputActionSetJsonDumpFile.toString())
|
||||
.cache();
|
||||
|
||||
writeActionsAsJobInput(actionDS, inputActionSetId, inputActionSetDir.toString());
|
||||
|
||||
Map<String, List<String>> actionSetOafsByType = actionDS
|
||||
.withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA))
|
||||
.select(expr("atomic_action.*"))
|
||||
.groupBy(col("clazz"))
|
||||
.agg(collect_list(col("payload")).as("payload_list"))
|
||||
.collectAsList()
|
||||
.stream()
|
||||
.map(row -> new AbstractMap.SimpleEntry<>(row.<String>getAs("clazz"),
|
||||
mutableSeqAsJavaList(row.<Seq<String>>getAs("payload_list"))))
|
||||
.collect(Collectors.toMap(AbstractMap.SimpleEntry::getKey, AbstractMap.SimpleEntry::getValue));
|
||||
|
||||
actionSetOafsByType.keySet()
|
||||
.forEach(x -> {
|
||||
if (oafsByType.containsKey(x)) {
|
||||
List<String> collected = new ArrayList<>();
|
||||
collected.addAll(oafsByType.get(x));
|
||||
collected.addAll(actionSetOafsByType.get(x));
|
||||
oafsByType.put(x, collected);
|
||||
} else {
|
||||
oafsByType.put(x, actionSetOafsByType.get(x));
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
return oafsByType;
|
||||
}
|
||||
|
||||
private static Dataset<String> readActionsFromJsonDump(String path) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(path);
|
||||
}
|
||||
|
||||
private static void writeActionsAsJobInput(Dataset<String> actionDS,
|
||||
String inputActionSetId,
|
||||
String path) {
|
||||
actionDS
|
||||
.javaRDD()
|
||||
.mapToPair(json -> new Tuple2<>(new Text(inputActionSetId), new Text(json)))
|
||||
.saveAsNewAPIHadoopFile(path,
|
||||
Text.class,
|
||||
Text.class,
|
||||
SequenceFileOutputFormat.class,
|
||||
configuration);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> void assertForOafType(Path outputDir, Map<String, List<String>> oafsByClassName, Class<T> clazz) {
|
||||
Path outputDatasetDir = outputDir.resolve(String.format("clazz=%s", clazz.getCanonicalName()));
|
||||
Files.exists(outputDatasetDir);
|
||||
|
||||
List<T> actuals = readActionPayloadFromJobOutput(outputDatasetDir.toString(), clazz).collectAsList();
|
||||
actuals.sort(Comparator.comparingInt(Object::hashCode));
|
||||
|
||||
List<T> expecteds = oafsByClassName.get(clazz.getCanonicalName()).stream()
|
||||
.map(json -> mapToOaf(json, clazz))
|
||||
.sorted(Comparator.comparingInt(Object::hashCode))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
assertIterableEquals(expecteds, actuals);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> Dataset<T> readActionPayloadFromJobOutput(String path,
|
||||
Class<T> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.parquet(path)
|
||||
.map((MapFunction<Row, T>) value -> OBJECT_MAPPER.readValue(value.<String>getAs("payload"), clazz),
|
||||
Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
private static <T extends Oaf> T mapToOaf(String json, Class<T> clazz) {
|
||||
return rethrowAsRuntimeException(
|
||||
() -> OBJECT_MAPPER.readValue(json, clazz),
|
||||
String.format("failed to map json to class: json=%s, class=%s", json, clazz.getCanonicalName())
|
||||
);
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue