diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml
index 802c3ff21..0b86dcdf1 100644
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@@ -19,6 +19,11 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-hive_2.11</artifactId>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java
index 95c3cd480..dbbb88b88 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java
@@ -18,29 +18,38 @@ public class SparkGraphImporterJob {
"/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
parser.parseArgument(args);
+ new SparkGraphImporterJob().run(parser);
+ }
+
+ private void run(ArgumentApplicationParser parser) {
try(SparkSession spark = getSparkSession(parser)) {
- final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String hiveDbName = parser.get("hive_db_name");
- spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName));
- spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
-
- // Read the input file and convert it into RDD of serializable object
- GraphMappingUtils.types.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name)
- .map(s -> new ObjectMapper().readValue(s, clazz))
- .rdd(), Encoders.bean(clazz))
- .write()
- .mode(SaveMode.Overwrite)
- .saveAsTable(hiveDbName + "." + name));
+ runWith(spark, inputPath, hiveDbName);
}
}
+ // public for testing
+ public void runWith(SparkSession spark, String inputPath, String hiveDbName) {
+
+ spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName));
+ spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
+
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+ // Read the input file and convert it into RDD of serializable object
+ GraphMappingUtils.types.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name)
+ .map(s -> new ObjectMapper().readValue(s, clazz))
+ .rdd(), Encoders.bean(clazz))
+ .write()
+ .mode(SaveMode.Overwrite)
+ .saveAsTable(hiveDbName + "." + name));
+ }
+
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
-
return SparkSession
.builder()
.appName(SparkGraphImporterJob.class.getSimpleName())
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json
index 86fca71f3..13c7abd51 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/input_graph_parameters.json
@@ -1,6 +1,6 @@
[
- {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
- {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
- {"paramName":"h", "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris", "paramRequired": true},
- {"paramName":"db", "paramLongName":"hive_db_name", "paramDescription": "the target hive database name", "paramRequired": true}
+ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
+ {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
+ {"paramName":"h", "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris", "paramRequired": true},
+ {"paramName":"db", "paramLongName":"hive_db_name", "paramDescription": "the target hive database name", "paramRequired": true}
]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml
index bbee2f01c..e63bbbbfb 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml
@@ -59,10 +59,10 @@
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
- <arg>-mt</arg> <arg>yarn-cluster</arg>
- <arg>--sourcePath</arg><arg>${sourcePath}</arg>
- <arg>--hive_db_name</arg><arg>${hive_db_name}</arg>
- <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
+ <arg>-mt</arg> <arg>yarn</arg>
+ <arg>-s</arg><arg>${sourcePath}</arg>
+ <arg>-db</arg><arg>${hive_db_name}</arg>
+ <arg>-h</arg><arg>${hive_metastore_uris}</arg>
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java
index c7743d684..511adf3f1 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java
@@ -1,52 +1,54 @@
package eu.dnetlib.dhp.graph;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.Encoders;
+import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
-import scala.Tuple2;
import java.nio.file.Path;
-import java.util.List;
-import java.util.stream.Collectors;
public class SparkGraphImporterJobTest {
- private static final long MAX = 1000L;
+ private static final String TEST_DB_NAME = "test";
- @Disabled("must be parametrized to run locally")
- public void testImport(@TempDir Path outPath) throws Exception {
- SparkGraphImporterJob.main(new String[] {
- "-mt", "local[*]",
- "-s", getClass().getResource("/eu/dnetlib/dhp/graph/sample").getPath(),
- "-h", "",
- "-db", "test"
- });
+ @Test
+ public void testImport(@TempDir Path outPath) {
+ try(SparkSession spark = testSparkSession(outPath.toString())) {
- countEntities(outPath.toString()).forEach(t -> {
- System.out.println(t);
- Assertions.assertEquals(MAX, t._2().longValue(), String.format("mapped %s must be %s", t._1(), MAX));
- });
+ new SparkGraphImporterJob().runWith(
+ spark,
+ getClass().getResource("/eu/dnetlib/dhp/graph/sample").getPath(),
+ TEST_DB_NAME);
+
+ GraphMappingUtils.types.forEach((name, clazz) -> {
+ final long count = spark.read().table(TEST_DB_NAME + "." + name).count();
+ if (name.equals("relation")) {
+ Assertions.assertEquals(100, count, String.format("%s should be 100", name));
+ } else {
+ Assertions.assertEquals(10, count, String.format("%s should be 10", name));
+ }
+ });
+ }
}
- public static List<Tuple2<String, Long>> countEntities(final String inputPath) {
+ private SparkSession testSparkSession(final String inputPath) {
+ SparkConf conf = new SparkConf();
- final SparkSession spark = SparkSession
+ conf.set("spark.driver.host", "localhost");
+ conf.set("hive.metastore.local", "true");
+ conf.set("hive.metastore.warehouse.dir", inputPath + "/warehouse");
+ conf.set("spark.sql.warehouse.dir", inputPath);
+ conf.set("javax.jdo.option.ConnectionURL", String.format("jdbc:derby:;databaseName=%s/junit_metastore_db;create=true", inputPath));
+ conf.set("spark.ui.enabled", "false");
+
+ return SparkSession
.builder()
.appName(SparkGraphImporterJobTest.class.getSimpleName())
.master("local[*]")
+ .config(conf)
+ .enableHiveSupport()
.getOrCreate();
- //final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-
- return GraphMappingUtils.types.entrySet()
- .stream()
- .map(entry -> {
- final Long count = spark.read().load(inputPath + "/" + entry.getKey()).as(Encoders.bean(entry.getValue())).count();
- return new Tuple2(entry.getKey(), count);
- })
- .collect(Collectors.toList());
}
+
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/dataset/dataset_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/dataset/dataset_10.json.gz
index ce0b9709b..0da3c4071 100644
Binary files a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/dataset/dataset_10.json.gz and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/dataset/dataset_10.json.gz differ
diff --git a/pom.xml b/pom.xml
index 6594943a8..861b7a2ea 100644
--- a/pom.xml
+++ b/pom.xml
@@ -143,6 +143,12 @@
<version>${dhp.spark.version}</version>
<scope>provided</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-hive_2.11</artifactId>
+ <version>${dhp.spark.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>org.slf4j</groupId>