unit test for GraphHiveImporterJob
This commit is contained in:
parent
ba9f07a6fe
commit
47f3d9b757
|
@ -29,6 +29,20 @@ public class SparkSessionSupport {
|
||||||
runWithSparkSession(c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn);
|
runWithSparkSession(c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs a given function using SparkSession created with hive support and using default builder and supplied SparkConf.
|
||||||
|
* Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally.
|
||||||
|
*
|
||||||
|
* @param conf SparkConf instance
|
||||||
|
* @param isSparkSessionManaged When true will stop SparkSession
|
||||||
|
* @param fn Consumer to be applied to constructed SparkSession
|
||||||
|
*/
|
||||||
|
public static void runWithSparkHiveSession(SparkConf conf,
|
||||||
|
Boolean isSparkSessionManaged,
|
||||||
|
ThrowingConsumer<SparkSession, Exception> fn) {
|
||||||
|
runWithSparkSession(c -> SparkSession.builder().config(c).enableHiveSupport().getOrCreate(), conf, isSparkSessionManaged, fn);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Runs a given function using SparkSession created using supplied builder and supplied SparkConf. Stops SparkSession
|
* Runs a given function using SparkSession created using supplied builder and supplied SparkConf. Stops SparkSession
|
||||||
* when SparkSession is managed. Allows to reuse SparkSession created externally.
|
* when SparkSession is managed. Allows to reuse SparkSession created externally.
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.dhp.actionmanager.promote;
|
package eu.dnetlib.dhp.actionmanager.promote;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -45,34 +46,7 @@ public class PromoteActionPayloadForGraphTableJobTest {
|
||||||
conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName());
|
conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName());
|
||||||
conf.setMaster("local");
|
conf.setMaster("local");
|
||||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||||
conf.registerKryoClasses(new Class[]{
|
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
|
||||||
Author.class,
|
|
||||||
Context.class,
|
|
||||||
Country.class,
|
|
||||||
DataInfo.class,
|
|
||||||
eu.dnetlib.dhp.schema.oaf.Dataset.class,
|
|
||||||
Datasource.class,
|
|
||||||
ExternalReference.class,
|
|
||||||
ExtraInfo.class,
|
|
||||||
Field.class,
|
|
||||||
GeoLocation.class,
|
|
||||||
Instance.class,
|
|
||||||
Journal.class,
|
|
||||||
KeyValue.class,
|
|
||||||
Oaf.class,
|
|
||||||
OafEntity.class,
|
|
||||||
OAIProvenance.class,
|
|
||||||
Organization.class,
|
|
||||||
OriginDescription.class,
|
|
||||||
OtherResearchProduct.class,
|
|
||||||
Project.class,
|
|
||||||
Publication.class,
|
|
||||||
Qualifier.class,
|
|
||||||
Relation.class,
|
|
||||||
Result.class,
|
|
||||||
Software.class,
|
|
||||||
StructuredProperty.class
|
|
||||||
});
|
|
||||||
spark = SparkSession.builder().config(conf).getOrCreate();
|
spark = SparkSession.builder().config(conf).getOrCreate();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,69 @@
|
||||||
|
package eu.dnetlib.dhp.oa.graph;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
public class GraphHiveImporterJob {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJob.class);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils.toString(GraphHiveImporterJob.class.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/input_graph_hive_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
String inputPath = parser.get("inputPath");
|
||||||
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
|
String hiveMetastoreUris = parser.get("hiveMetastoreUris");
|
||||||
|
log.info("hiveMetastoreUris: {}", hiveMetastoreUris);
|
||||||
|
|
||||||
|
String hiveDbName = parser.get("hiveDbName");
|
||||||
|
log.info("hiveDbName: {}", hiveDbName);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.set("hive.metastore.uris", hiveMetastoreUris);
|
||||||
|
|
||||||
|
runWithSparkHiveSession(conf, isSparkSessionManaged,
|
||||||
|
spark -> loadGraphAsHiveDB(spark, inputPath, hiveDbName));
|
||||||
|
}
|
||||||
|
|
||||||
|
// protected for testing
|
||||||
|
private static void loadGraphAsHiveDB(SparkSession spark, String inputPath, String hiveDbName) {
|
||||||
|
|
||||||
|
spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName));
|
||||||
|
spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
|
||||||
|
|
||||||
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
// Read the input file and convert it into RDD of serializable object
|
||||||
|
ModelSupport.oafTypes.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name)
|
||||||
|
.map(s -> new ObjectMapper().readValue(s, clazz))
|
||||||
|
.rdd(), Encoders.bean(clazz))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.saveAsTable(hiveDbName + "." + name));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,62 +0,0 @@
|
||||||
package eu.dnetlib.dhp.oa.graph;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SaveMode;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
|
|
||||||
public class SparkGraphImporterJob {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
|
||||||
IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/oa/graph/input_graph_parameters.json")));
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
new SparkGraphImporterJob().run(parser);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void run(ArgumentApplicationParser parser) {
|
|
||||||
try(SparkSession spark = getSparkSession(parser)) {
|
|
||||||
|
|
||||||
final String inputPath = parser.get("sourcePath");
|
|
||||||
final String hiveDbName = parser.get("hive_db_name");
|
|
||||||
|
|
||||||
runWith(spark, inputPath, hiveDbName);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// protected for testing
|
|
||||||
protected void runWith(SparkSession spark, String inputPath, String hiveDbName) {
|
|
||||||
|
|
||||||
spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName));
|
|
||||||
spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
|
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
|
||||||
// Read the input file and convert it into RDD of serializable object
|
|
||||||
ModelSupport.oafTypes.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name)
|
|
||||||
.map(s -> new ObjectMapper().readValue(s, clazz))
|
|
||||||
.rdd(), Encoders.bean(clazz))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.saveAsTable(hiveDbName + "." + name));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
|
|
||||||
return SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(SparkGraphImporterJob.class.getSimpleName())
|
|
||||||
.master(parser.get("master"))
|
|
||||||
.config(conf)
|
|
||||||
.enableHiveSupport()
|
|
||||||
.getOrCreate();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -49,7 +49,7 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>MapGraphAsHiveDB</name>
|
<name>MapGraphAsHiveDB</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.SparkGraphImporterJob</class>
|
<class>eu.dnetlib.dhp.oa.graph.GraphHiveImporterJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory ${sparkExecutorMemory}
|
--executor-memory ${sparkExecutorMemory}
|
|
@ -0,0 +1,26 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "issm",
|
||||||
|
"paramLongName": "isSparkSessionManaged",
|
||||||
|
"paramDescription": "when true will stop SparkSession after job execution",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "in",
|
||||||
|
"paramLongName": "inputPath",
|
||||||
|
"paramDescription": "the path to the graph data dump to read",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "hmu",
|
||||||
|
"paramLongName": "hiveMetastoreUris",
|
||||||
|
"paramDescription": "the hive metastore uris",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "db",
|
||||||
|
"paramLongName": "hiveDbName",
|
||||||
|
"paramDescription": "the target hive database name",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -1,6 +0,0 @@
|
||||||
[
|
|
||||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
|
||||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
|
||||||
{"paramName":"h", "paramLongName":"hive_metastore_uris","paramDescription": "the hive metastore uris", "paramRequired": true},
|
|
||||||
{"paramName":"db", "paramLongName":"hive_db_name", "paramDescription": "the target hive database name", "paramRequired": true}
|
|
||||||
]
|
|
|
@ -0,0 +1,92 @@
|
||||||
|
package eu.dnetlib.dhp.oa.graph;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.commons.lang3.RandomStringUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.junit.jupiter.api.*;
|
||||||
|
import org.junit.jupiter.api.io.TempDir;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
public class GraphHiveImporterJobTest {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJobTest.class);
|
||||||
|
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
private static final ClassLoader cl = GraphHiveImporterJobTest.class.getClassLoader();
|
||||||
|
|
||||||
|
public static final String JDBC_DERBY_TEMPLATE = "jdbc:derby:;databaseName=%s/junit_metastore_db;create=true";
|
||||||
|
|
||||||
|
private static SparkSession spark;
|
||||||
|
|
||||||
|
private static Path workingDir;
|
||||||
|
|
||||||
|
private static String dbName;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void beforeAll() throws IOException {
|
||||||
|
workingDir = Files.createTempDirectory(GraphHiveImporterJobTest.class.getSimpleName());
|
||||||
|
log.info("using work dir {}", workingDir);
|
||||||
|
|
||||||
|
dbName = RandomStringUtils.randomAlphabetic(5);
|
||||||
|
log.info("using DB name {}", "test_" + dbName);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.setAppName(GraphHiveImporterJobTest.class.getSimpleName());
|
||||||
|
|
||||||
|
conf.setMaster("local[*]");
|
||||||
|
conf.set("spark.driver.host", "localhost");
|
||||||
|
conf.set("hive.metastore.local", "true");
|
||||||
|
conf.set("spark.ui.enabled", "false");
|
||||||
|
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||||
|
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||||
|
conf.set("javax.jdo.option.ConnectionURL", String.format(JDBC_DERBY_TEMPLATE, workingDir.resolve("warehouse").toString()));
|
||||||
|
|
||||||
|
spark = SparkSession
|
||||||
|
.builder()
|
||||||
|
.appName(GraphHiveImporterJobTest.class.getSimpleName())
|
||||||
|
.config(conf)
|
||||||
|
.enableHiveSupport()
|
||||||
|
.getOrCreate();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterAll
|
||||||
|
public static void afterAll() throws IOException {
|
||||||
|
FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
|
spark.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testImportGraphAsHiveDB() throws Exception {
|
||||||
|
|
||||||
|
GraphHiveImporterJob.main(new String[]{
|
||||||
|
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"-inputPath", getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
|
||||||
|
"-hiveMetastoreUris", "",
|
||||||
|
"-hiveDbName", dbName
|
||||||
|
});
|
||||||
|
|
||||||
|
ModelSupport.oafTypes.forEach((name, clazz) -> {
|
||||||
|
long count = spark.read().table(dbName + "." + name).count();
|
||||||
|
int expected = name.equals("relation") ? 100 : 10;
|
||||||
|
|
||||||
|
Assertions.assertEquals(expected, count, String.format("%s should be %s", name, expected));
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,54 +0,0 @@
|
||||||
package eu.dnetlib.dhp.oa.graph;
|
|
||||||
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.junit.jupiter.api.io.TempDir;
|
|
||||||
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
public class SparkGraphImporterJobTest {
|
|
||||||
|
|
||||||
private final static String TEST_DB_NAME = "test";
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testImport(@TempDir Path outPath) {
|
|
||||||
try(SparkSession spark = testSparkSession(outPath.toString())) {
|
|
||||||
|
|
||||||
new SparkGraphImporterJob().runWith(
|
|
||||||
spark,
|
|
||||||
getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
|
|
||||||
TEST_DB_NAME);
|
|
||||||
|
|
||||||
GraphMappingUtils.types.forEach((name, clazz) -> {
|
|
||||||
final long count = spark.read().table(TEST_DB_NAME + "." + name).count();
|
|
||||||
if (name.equals("relation")) {
|
|
||||||
Assertions.assertEquals(100, count, String.format("%s should be 100", name));
|
|
||||||
} else {
|
|
||||||
Assertions.assertEquals(10, count, String.format("%s should be 10", name));
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private SparkSession testSparkSession(final String inputPath) {
|
|
||||||
SparkConf conf = new SparkConf();
|
|
||||||
|
|
||||||
conf.set("spark.driver.host", "localhost");
|
|
||||||
conf.set("hive.metastore.local", "true");
|
|
||||||
conf.set("hive.metastore.warehouse.dir", inputPath + "/warehouse");
|
|
||||||
conf.set("spark.sql.warehouse.dir", inputPath);
|
|
||||||
conf.set("javax.jdo.option.ConnectionURL", String.format("jdbc:derby:;databaseName=%s/junit_metastore_db;create=true", inputPath));
|
|
||||||
conf.set("spark.ui.enabled", "false");
|
|
||||||
|
|
||||||
return SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(SparkGraphImporterJobTest.class.getSimpleName())
|
|
||||||
.master("local[*]")
|
|
||||||
.config(conf)
|
|
||||||
.enableHiveSupport()
|
|
||||||
.getOrCreate();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
Loading…
Reference in New Issue