From a52d5bde4f1336769260bea06602b5c954933c55 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Wed, 6 Nov 2019 17:45:52 +0100
Subject: [PATCH] simplified import procedure, maps the infospace as hive tables

---
 dhp-workflows/dhp-graph-mapper/pom.xml       |  5 ---
 .../dhp/graph/SparkGraphImporterJob.java     | 32 +++++++++----------
 .../dhp/graph/input_graph_parameters.json    |  3 +-
 .../dhp/graph/oozie_app/config-default.xml   |  4 +--
 .../dnetlib/dhp/graph/oozie_app/workflow.xml | 14 ++------
 pom.xml                                      |  6 ----
 6 files changed, 20 insertions(+), 44 deletions(-)

diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml
index c1abc68712..664de5ff17 100644
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@@ -32,11 +32,6 @@
             <version>1.0.4-SNAPSHOT</version>
         </dependency>
-        <dependency>
-            <groupId>de.javakaffee</groupId>
-            <artifactId>kryo-serializers</artifactId>
-        </dependency>
-

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java
index ac1b0a860d..debe4436ad 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java
@@ -3,11 +3,11 @@ package eu.dnetlib.dhp.graph;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
 import scala.Tuple2;
@@ -19,29 +19,27 @@ public class SparkGraphImporterJob {
         parser.parseArgument(args);
         final SparkSession spark = SparkSession
                 .builder()
-                .appName("ImportGraph")
+                .appName(SparkGraphImporterJob.class.getSimpleName())
                 .master(parser.get("master"))
+                .config("hive.metastore.uris", parser.get("hive.metastore.uris"))
+                .enableHiveSupport()
                 .getOrCreate();
+
         final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
         final String inputPath = parser.get("sourcePath");
-        final String outputPath = parser.get("targetPath");
-
-        final String filter = parser.get("filter");
 
         // Read the input file and convert it into RDD of serializable object
-        final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath, Text.class, Text.class)
-                .map(item -> new Tuple2<>(item._1.toString(), item._2.toString()));
 
         GraphMappingUtils.types.forEach((name, clazz) -> {
-            if (StringUtils.isNotBlank(filter) || filter.toLowerCase().contains(name)) {
-                spark.createDataset(inputRDD
-                        .filter(s -> s._1().equals(clazz.getName()))
-                        .map(Tuple2::_2)
-                        .map(s -> new ObjectMapper().readValue(s, clazz))
-                        .rdd(), Encoders.bean(clazz))
-                        .write()
-                        .save(outputPath + "/" + name);
-            }
+            final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class)
+                    .map(item -> new Tuple2<>(item._1.toString(), item._2.toString()));
+            spark.createDataset(inputRDD
+                    .filter(s -> s._1().equals(clazz.getName()))
+                    .map(Tuple2::_2)
+                    .map(s -> new ObjectMapper().readValue(s, clazz))
+                    .rdd(), Encoders.bean(clazz))
+                    .write()
+                    .mode(SaveMode.Overwrite)
+                    .saveAsTable("openaire." + name);
         });
     }

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json
index ca7283b755..7f76c064fc 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json
@@ -1,6 +1,5 @@
 [
     {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
     {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
-    {"paramName":"f", "paramLongName":"filter", "paramDescription": "csv of typology of dataframe to be generated", "paramRequired": false},
-    {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path where store DataFrames on HDFS", "paramRequired": true}
+    {"paramName":"h", "paramLongName":"hive.metastore.uris","paramDescription": "the hive metastore uris", "paramRequired": true}
 ]
\ No newline at end of file

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml
index b2b577712e..a5201a743a 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml
@@ -16,7 +16,7 @@
         <value>spark2</value>
     </property>
     <property>
-        <name>filter</name>
-        <value></value>
+        <name>hive.metastore.uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
     </property>
 </configuration>
\ No newline at end of file

diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml
index 8d90af1858..ed23690874 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml
@@ -4,10 +4,6 @@
             <name>sourcePath</name>
             <description>the source path</description>
         </property>
-        <property>
-            <name>targetPath</name>
-            <description>the target path</description>
-        </property>
         <property>
             <name>sparkDriverMemory</name>
             <description>memory for driver process</description>
@@ -20,10 +16,6 @@
             <name>sparkExecutorCores</name>
             <description>number of cores used by single executor</description>
         </property>
-        <property>
-            <name>filter</name>
-            <description>csv string to filter the entities to generate</description>
-        </property>
     </parameters>
@@ -41,12 +33,10 @@
             <name>MapGraphIntoDataFrame</name>
             <class>eu.dnetlib.dhp.graph.SparkGraphImporterJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"</spark-opts>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
             <arg>-mt</arg> <arg>yarn-cluster</arg>
             <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-            <arg>--targetPath</arg><arg>${targetPath}</arg>
-            <arg>--filter</arg><arg>${filter}</arg>
-
+            <arg>--hive.metastore.uris</arg><arg>${hive.metastore.uris}</arg>

diff --git a/pom.xml b/pom.xml
index 1b746c0bac..29f4e94273 100644
--- a/pom.xml
+++ b/pom.xml
@@ -164,12 +164,6 @@
                 <version>1.1.6</version>
             </dependency>
-            <dependency>
-                <groupId>de.javakaffee</groupId>
-                <artifactId>kryo-serializers</artifactId>
-                <version>0.45</version>
-            </dependency>
-
             <dependency>
                 <groupId>net.schmizz</groupId>
                 <artifactId>sshj</artifactId>
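
Note (not part of the patch): the following is a minimal verification sketch, not the author's code. It assumes the workflow above has run and that the job registered one Hive table per entity type in the "openaire" database, following the saveAsTable("openaire." + name) convention of SparkGraphImporterJob. The entity names listed are a hypothetical subset (the real list comes from GraphMappingUtils.types), and passing the metastore URI as the first CLI argument is an arbitrary choice made only for this example.

import org.apache.spark.sql.SparkSession;

public class CheckImportedTables {

    public static void main(String[] args) {
        // Hive metastore URI, e.g. thrift://<host>:9083, taken from args[0] in this sketch
        final String hiveMetastoreUris = args[0];

        final SparkSession spark = SparkSession
                .builder()
                .appName(CheckImportedTables.class.getSimpleName())
                .master("yarn")
                .config("hive.metastore.uris", hiveMetastoreUris)
                .enableHiveSupport()
                .getOrCreate();

        // Hypothetical subset of the entity names mapped by the import job;
        // each one should now be readable as the Hive table "openaire." + name.
        for (final String name : new String[] { "publication", "datasource", "organization", "relation" }) {
            final long rows = spark.table("openaire." + name).count();
            System.out.println("openaire." + name + ": " + rows + " rows");
        }

        spark.stop();
    }
}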