WIP action payload mapping into OAF type moved, (local) graph table name enum created, tests fixed

This commit is contained in:
Przemysław Jacewicz 2020-03-13 10:01:39 +01:00 committed by przemek
parent 5cc560c7e5
commit 8d9b3c5de2
2 changed files with 122 additions and 134 deletions

View File

@ -9,6 +9,7 @@ import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.*;
@ -25,10 +26,22 @@ import static org.apache.spark.sql.functions.*;
public class PromoteActionSetFromHDFSJob {
// TODO replace with project's common implementation
public enum GraphTableName {
DATASET, DATASOURCE, ORGANIZATION, OTHERRESEARCHPRODUCT, PROJECT, PUBLICATION, RELATION, SOFTWARE
}
private static final StructType KV_SCHEMA = StructType$.MODULE$.apply(
Arrays.asList(
StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()),
StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty())
));
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(
String jsonConfiguration = IOUtils.toString(
PromoteActionSetFromHDFSJob.class
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/actionmanager_input_parameters.json")));
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/actionmanager_input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
@ -37,9 +50,9 @@ public class PromoteActionSetFromHDFSJob {
.orElse(Boolean.TRUE);
String inputGraphPath = parser.get("inputGraphPath");
String inputActionSetPaths = parser.get("inputActionSetPaths");
String graphTableName = parser.get("graphTableName");
GraphTableName graphTableName = GraphTableName.valueOf(parser.get("graphTableName").toUpperCase());
String outputGraphPath = parser.get("outputGraphPath");
OafMergeAndGet.Strategy strategy = OafMergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy"));
OafMergeAndGet.Strategy strategy = OafMergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
@ -75,78 +88,71 @@ public class PromoteActionSetFromHDFSJob {
SparkSession spark = null;
try {
spark = SparkSession.builder().config(conf).getOrCreate();
String inputGraphTablePath = String.format("%s/%s", inputGraphPath, graphTableName.name().toLowerCase());
String outputGraphTablePath = String.format("%s/%s", outputGraphPath, graphTableName.name().toLowerCase());
//TODO make graph table generic using enums
switch (graphTableName) {
case "dataset":
processWith(spark,
String.format("%s/%s", inputGraphPath, graphTableName),
case DATASET:
processGraphTable(spark,
inputGraphTablePath,
inputActionSetPaths,
outputGraphPath,
graphTableName,
outputGraphTablePath,
strategy,
eu.dnetlib.dhp.schema.oaf.Dataset.class);
break;
case "datasource":
processWith(spark,
String.format("%s/%s", inputGraphPath, graphTableName),
case DATASOURCE:
processGraphTable(spark,
inputGraphTablePath,
inputActionSetPaths,
outputGraphPath,
graphTableName,
outputGraphTablePath,
strategy,
Datasource.class);
break;
case "organization":
processWith(spark,
String.format("%s/%s", inputGraphPath, graphTableName),
case ORGANIZATION:
processGraphTable(spark,
inputGraphTablePath,
inputActionSetPaths,
outputGraphPath,
graphTableName,
outputGraphTablePath,
strategy,
Organization.class);
break;
case "otherresearchproduct":
processWith(spark,
String.format("%s/%s", inputGraphPath, graphTableName),
case OTHERRESEARCHPRODUCT:
processGraphTable(spark,
inputGraphTablePath,
inputActionSetPaths,
outputGraphPath,
graphTableName,
outputGraphTablePath,
strategy,
OtherResearchProduct.class);
break;
case "project":
processWith(spark,
String.format("%s/%s", inputGraphPath, graphTableName),
case PROJECT:
processGraphTable(spark,
inputGraphTablePath,
inputActionSetPaths,
outputGraphPath,
graphTableName,
outputGraphTablePath,
strategy,
Project.class);
break;
case "publication":
processWith(spark,
String.format("%s/%s", inputGraphPath, graphTableName),
case PUBLICATION:
processGraphTable(spark,
inputGraphTablePath,
inputActionSetPaths,
outputGraphPath,
graphTableName,
outputGraphTablePath,
strategy,
Publication.class);
break;
case "relation":
processWith(spark,
String.format("%s/%s", inputGraphPath, graphTableName),
case RELATION:
processGraphTable(spark,
inputGraphTablePath,
inputActionSetPaths,
outputGraphPath,
graphTableName,
outputGraphTablePath,
strategy,
Relation.class);
break;
case "software":
processWith(spark,
String.format("%s/%s", inputGraphPath, graphTableName),
case SOFTWARE:
processGraphTable(spark,
inputGraphTablePath,
inputActionSetPaths,
outputGraphPath,
graphTableName,
outputGraphTablePath,
strategy,
Software.class);
break;
@ -160,91 +166,74 @@ public class PromoteActionSetFromHDFSJob {
}
}
private static final StructType KV_SCHEMA = StructType$.MODULE$.apply(
Arrays.asList(
StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()),
StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty())
));
private static <T extends Oaf> void processGraphTable(SparkSession spark,
String inputGraphTablePath,
String inputActionSetPaths,
String outputGraphTablePath,
OafMergeAndGet.Strategy strategy,
Class<T> clazz) {
Dataset<T> tableDS = readGraphTable(spark, inputGraphTablePath, clazz)
.cache();
Dataset<T> actionPayloadDS = readActionSetPayloads(spark, inputActionSetPaths, clazz)
.cache();
private static <T> Dataset<T> readGraphTable(SparkSession spark, String path, Class<T> clazz) {
Dataset<T> result = promoteActionSetForGraphTable(tableDS, actionPayloadDS, strategy, clazz)
.map((MapFunction<T, T>) value -> value, Encoders.bean(clazz));
saveGraphTableAsParquet(result, outputGraphTablePath);
}
private static <T extends Oaf> Dataset<T> readGraphTable(SparkSession spark,
String inputGraphTablePath,
Class<T> clazz) {
JavaRDD<Row> rows = JavaSparkContext
.fromSparkContext(spark.sparkContext())
.sequenceFile(path, Text.class, Text.class)
.sequenceFile(inputGraphTablePath, Text.class, Text.class)
.map(x -> RowFactory.create(x._1().toString(), x._2().toString()));
return spark.createDataFrame(rows, KV_SCHEMA)
.map((MapFunction<Row, T>) row -> new ObjectMapper().readValue(row.<String>getAs("value"), clazz),
Encoders.kryo(clazz));
Encoders.bean(clazz));
}
private static Dataset<String> readActionSetPayloads(SparkSession spark, String inputActionSetPaths) {
private static <T extends Oaf> Dataset<T> readActionSetPayloads(SparkSession spark,
String inputActionSetPaths,
Class<T> clazz) {
return Arrays
.stream(inputActionSetPaths.split(","))
.map(inputActionSetPath -> readActionSetPayload(spark, inputActionSetPath))
.map(inputActionSetPath -> readActionSetPayload(spark, inputActionSetPath, clazz))
.reduce(Dataset::union)
.get();
.orElseThrow(() -> new RuntimeException("error reading action sets: " + inputActionSetPaths));
}
private static Dataset<String> readActionSetPayload(SparkSession spark, String inputActionSetPath) {
private static <T extends Oaf> Dataset<T> readActionSetPayload(SparkSession spark,
String inputActionSetPath,
Class<T> clazz) {
JavaRDD<Row> actionsRDD = JavaSparkContext
.fromSparkContext(spark.sparkContext())
.sequenceFile(inputActionSetPath, Text.class, Text.class)
.map(x -> RowFactory.create(x._1().toString(), x._2().toString()));
SerializableSupplier<BiFunction<String, Class<T>, T>> actionPayloadToOafFn = () -> (json, c) -> {
try {
return new ObjectMapper().disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES).readValue(json, c);
} catch (IOException e) {
return null;
}
};
return spark.createDataFrame(actionsRDD, KV_SCHEMA)
.select(unbase64(get_json_object(col("value"), "$.TargetValue"))
.cast(DataTypes.StringType).as("target_value_json"))
.as(Encoders.STRING());
.as(Encoders.STRING())
.map((MapFunction<String, T>) value -> actionPayloadToOafFn.get().apply(value, clazz), Encoders.bean(clazz))
.filter((FilterFunction<T>) Objects::nonNull);
}
private static <T extends Oaf> void processWith(SparkSession spark,
String inputGraphTablePath,
String inputActionSetPaths,
String outputGraphPath,
String graphTableName,
OafMergeAndGet.Strategy strategy,
Class<T> clazz) {
// System.out.println("===== tableDS =====");
Dataset<T> tableDS = readGraphTable(spark, inputGraphTablePath, clazz)
.cache();
// tableDS.printSchema();
// tableDS.show();
// tableDS.explain();
// System.out.println("DEBUG: tableDS.partitions=" + tableDS.rdd().getNumPartitions());
// System.out.println("===== actionPayloadDS =====");
Dataset<String> actionPayloadDS = readActionSetPayloads(spark, inputActionSetPaths)
.cache();
// actionPayloadDS.printSchema();
// actionPayloadDS.show();
// actionPayloadDS.explain();
// System.out.println("DEBUG: actionPayloadDS.partitions=" + actionPayloadDS.rdd().getNumPartitions());
// System.out.println("===== processed =====");
Dataset<T> processed = processGraphTable(tableDS, actionPayloadDS, strategy, clazz);
// processed.printSchema();
// processed.show();
// processed.explain();
// System.out.println("DEBUG: processed.partitions=" + processed.rdd().getNumPartitions());
// System.out.println("===== result =====");
Dataset<T> result = processed
.map((MapFunction<T, T>) value -> value, Encoders.bean(clazz));
// result.printSchema();
// result.show();
// result.explain();
// System.out.println("DEBUG: result.partitions=" + result.rdd().getNumPartitions());
String outputGraphTablePath = String.format("%s/%s", outputGraphPath, graphTableName);
result.write()
.format("parquet")
.save(outputGraphTablePath);
}
private static <T extends Oaf> Dataset<T> processGraphTable(Dataset<T> tableDS,
Dataset<String> actionPayloadDS,
OafMergeAndGet.Strategy strategy,
Class<T> clazz) {
private static <T extends Oaf> Dataset<T> promoteActionSetForGraphTable(Dataset<T> tableDS,
Dataset<T> actionPayloadDS,
OafMergeAndGet.Strategy strategy,
Class<T> clazz) {
SerializableSupplier<Function<T, String>> oafIdFn = () -> x -> {
if (x instanceof Relation) {
Relation r = (Relation) x;
@ -274,13 +263,6 @@ public class PromoteActionSetFromHDFSJob {
tableDS,
actionPayloadDS,
oafIdFn,
() -> (json, c) -> {
try {
return new ObjectMapper().disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES).readValue(json, clazz);
} catch (IOException e) {
return null;
}
},
mergeAndGetFn,
clazz);
@ -292,4 +274,10 @@ public class PromoteActionSetFromHDFSJob {
clazz
);
}
private static <T extends Oaf> void saveGraphTableAsParquet(Dataset<T> result, String outputGraphTablePath) {
result.write()
.format("parquet")
.save(outputGraphTablePath);
}
}

View File

@ -109,67 +109,67 @@ public class PromoteActionSetFromHDFSJobTest {
@Test
public void shouldReadActionsFromHDFSAndPromoteThemForDatasetsUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor("dataset",
readActionsFromHDFSAndPromoteThemFor(PromoteActionSetFromHDFSJob.GraphTableName.DATASET,
OafMergeAndGet.Strategy.MERGE_FROM_AND_GET,
eu.dnetlib.dhp.schema.oaf.Dataset.class);
}
@Test
public void shouldReadActionsFromHDFSAndPromoteThemForDatasourcesUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor("datasource",
readActionsFromHDFSAndPromoteThemFor(PromoteActionSetFromHDFSJob.GraphTableName.DATASOURCE,
OafMergeAndGet.Strategy.MERGE_FROM_AND_GET,
Datasource.class);
}
@Test
public void shouldReadActionsFromHDFSAndPromoteThemForOrganizationUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor("organization",
public void shouldReadActionsFromHDFSAndPromoteThemForOrganizationsUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor(PromoteActionSetFromHDFSJob.GraphTableName.ORGANIZATION,
OafMergeAndGet.Strategy.MERGE_FROM_AND_GET,
Organization.class);
}
@Test
public void shouldReadActionsFromHDFSAndPromoteThemForOtherResearchProductUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor("otherresearchproduct",
public void shouldReadActionsFromHDFSAndPromoteThemForOtherResearchProductsUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor(PromoteActionSetFromHDFSJob.GraphTableName.OTHERRESEARCHPRODUCT,
OafMergeAndGet.Strategy.MERGE_FROM_AND_GET,
OtherResearchProduct.class);
}
@Test
public void shouldReadActionsFromHDFSAndPromoteThemForProjectUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor("project",
public void shouldReadActionsFromHDFSAndPromoteThemForProjectsUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor(PromoteActionSetFromHDFSJob.GraphTableName.PROJECT,
OafMergeAndGet.Strategy.MERGE_FROM_AND_GET,
Project.class);
}
@Test
public void shouldReadActionsFromHDFSAndPromoteThemForPublicationUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor("publication",
public void shouldReadActionsFromHDFSAndPromoteThemForPublicationsUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor(PromoteActionSetFromHDFSJob.GraphTableName.PUBLICATION,
OafMergeAndGet.Strategy.MERGE_FROM_AND_GET,
Publication.class);
}
@Test
public void shouldReadActionsFromHDFSAndPromoteThemForRelationUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor("relation",
public void shouldReadActionsFromHDFSAndPromoteThemForRelationsUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor(PromoteActionSetFromHDFSJob.GraphTableName.RELATION,
OafMergeAndGet.Strategy.MERGE_FROM_AND_GET,
Relation.class);
}
@Test
public void shouldReadActionsFromHDFSAndPromoteThemForSoftwareUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor("software",
public void shouldReadActionsFromHDFSAndPromoteThemForSoftwaresUsingMergeFromStrategy() throws Exception {
readActionsFromHDFSAndPromoteThemFor(PromoteActionSetFromHDFSJob.GraphTableName.SOFTWARE,
OafMergeAndGet.Strategy.MERGE_FROM_AND_GET,
Software.class);
}
private <T extends Oaf> void readActionsFromHDFSAndPromoteThemFor(String graphTableName,
private <T extends Oaf> void readActionsFromHDFSAndPromoteThemFor(PromoteActionSetFromHDFSJob.GraphTableName graphTableName,
OafMergeAndGet.Strategy strategy,
Class<T> clazz) throws Exception {
// given
String inputGraphTableJsonDumpPath =
String.format("%s/%s.json", "eu/dnetlib/dhp/actionmanager/input/graph", graphTableName);
createGraphTableFor(inputGraphTableJsonDumpPath, graphTableName, clazz);
String.format("%s/%s.json", "eu/dnetlib/dhp/actionmanager/input/graph", graphTableName.name().toLowerCase());
createGraphTableFor(inputGraphTableJsonDumpPath, graphTableName.name().toLowerCase(), clazz);
String inputActionSetPaths = createActionSets();
Path outputGraphDir = outputDir.resolve("graph");
@ -178,13 +178,13 @@ public class PromoteActionSetFromHDFSJobTest {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-inputGraphPath", inputGraphDir.toString(),
"-inputActionSetPaths", inputActionSetPaths,
"-graphTableName", graphTableName,
"-graphTableName", graphTableName.name(),
"-outputGraphPath", outputGraphDir.toString(),
"mergeAndGetStrategy", strategy.name()
"-mergeAndGetStrategy", strategy.name()
});
// then
Path outputGraphTableDir = outputGraphDir.resolve(graphTableName);
Path outputGraphTableDir = outputGraphDir.resolve(graphTableName.name().toLowerCase());
assertTrue(Files.exists(outputGraphDir));
List<T> outputGraphTableRows = readGraphTableFromParquet(outputGraphTableDir.toString(), clazz).collectAsList();
@ -193,7 +193,7 @@ public class PromoteActionSetFromHDFSJobTest {
assertEquals(10, outputGraphTableRows.size());
String expectedOutputGraphTableJsonDumpPath =
String.format("%s/%s/%s.json", "eu/dnetlib/dhp/actionmanager/output/graph", strategy.name().toLowerCase(), graphTableName);
String.format("%s/%s/%s.json", "eu/dnetlib/dhp/actionmanager/output/graph", strategy.name().toLowerCase(), graphTableName.name().toLowerCase());
Path expectedOutputGraphTableJsonDumpFile = Paths
.get(Objects.requireNonNull(cl.getResource(expectedOutputGraphTableJsonDumpPath)).getFile());
List<T> expectedOutputGraphTableRows = readGraphTableFromJSON(