diff --git a/dhp-shade-package/dependency-reduced-pom.xml b/dhp-shade-package/dependency-reduced-pom.xml
index 04843072f..ca2c6fd30 100644
--- a/dhp-shade-package/dependency-reduced-pom.xml
+++ b/dhp-shade-package/dependency-reduced-pom.xml
@@ -95,6 +95,10 @@
byte-buddy-agent
net.bytebuddy
+
+ objenesis
+ org.objenesis
+
@@ -102,6 +106,12 @@
mockito-junit-jupiter
3.3.3
test
+
+
+ junit-jupiter-api
+ org.junit.jupiter
+
+
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java
index 1adad104e..8172456bb 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java
@@ -160,7 +160,7 @@ public class ORCIDExtractor extends Thread {
}
} finally {
for (SequenceFile.Writer k : fileMap.values()) {
- log.info("Thread {}: Completed processed {} items", id, extractedItem);
+ log.info("Thread {}: Completed processed {} items", id, extractedItem);
k.hflush();
k.close();
}
diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml
index d7ae60a91..3cbc5477c 100644
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@@ -88,6 +88,12 @@
eu.dnetlib.dhp
dhp-common
${project.version}
+
+
+ log4j
+ log4j
+
+
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml
index 9608732ed..7fbd5e301 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml
@@ -23,4 +23,8 @@
hiveDbName
openaire
+
+ sparkSqlWarehouseDir
+ /user/hive/warehouse
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql
new file mode 100644
index 000000000..456713612
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql
@@ -0,0 +1,49 @@
+
+CREATE TEMPORARY VIEW datasource USING json OPTIONS ( path "${inputPath}/datasource"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.datasource
+ USING parquet
+ CLUSTERED BY ( id ) INTO 200 BUCKETS
+ AS SELECT * FROM datasource DISTRIBUTE BY id; /*EOS*/
+
+CREATE TEMPORARY VIEW dataset USING json OPTIONS ( path "${inputPath}/dataset"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.dataset
+ USING parquet
+ CLUSTERED BY ( id ) INTO 4000 BUCKETS
+ AS SELECT * FROM dataset DISTRIBUTE BY id; /*EOS*/
+
+CREATE TEMPORARY VIEW organization USING json OPTIONS ( path "${inputPath}/organization"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.organization
+ USING parquet
+ CLUSTERED BY ( id ) INTO 1000 BUCKETS
+ AS SELECT * FROM organization DISTRIBUTE BY id; /*EOS*/
+
+CREATE TEMPORARY VIEW otherresearchproduct USING json OPTIONS ( path "${inputPath}/otherresearchproduct"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.otherresearchproduct
+ USING parquet
+ CLUSTERED BY ( id ) INTO 8000 BUCKETS
+ AS SELECT * FROM otherresearchproduct DISTRIBUTE BY id; /*EOS*/
+
+CREATE TEMPORARY VIEW project USING json OPTIONS ( path "${inputPath}/project"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.project
+ USING parquet
+ CLUSTERED BY ( id ) INTO 1000 BUCKETS
+ AS SELECT * FROM project DISTRIBUTE BY id; /*EOS*/
+
+CREATE TEMPORARY VIEW publication USING json OPTIONS ( path "${inputPath}/publication"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.publication
+ USING parquet
+ CLUSTERED BY ( id ) INTO 10000 BUCKETS
+ AS SELECT * FROM publication DISTRIBUTE BY id; /*EOS*/
+
+CREATE TEMPORARY VIEW relation USING json OPTIONS ( path "${inputPath}/relation"); /*EOS*/
+DROP TABLE IF EXISTS ${hiveDbName}.relation; /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.relation
+ USING parquet
+ PARTITIONED BY ( relClass )
+ AS SELECT * FROM relation DISTRIBUTE BY source,target; /*EOS*/
+
+CREATE TEMPORARY VIEW software USING json OPTIONS ( path "${inputPath}/software"); /*EOS*/
+CREATE TABLE IF NOT EXISTS ${hiveDbName}.software
+ USING parquet
+ CLUSTERED BY ( id ) INTO 1000 BUCKETS
+  AS SELECT * FROM software DISTRIBUTE BY id; /*EOS*/
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
index 872ef8a2d..37dd3523c 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml
@@ -51,6 +51,22 @@
spark2EventLogDir
spark 2.* event log dir location
+
+
+ sparkClusterOpts
+ --conf spark.network.timeout=600 --conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory
+ spark cluster-wide options
+
+
+ sparkResourceOpts
+ --executor-memory=3G --conf spark.executor.memoryOverhead=3G --executor-cores=6 --driver-memory=8G --driver-cores=4
+ spark resource options
+
+
+ sparkApplicationOpts
+ --conf spark.sql.shuffle.partitions=3840
+ spark application options
+
@@ -90,285 +106,32 @@
hiveDbName=${hiveDbName}
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
+
yarn
cluster
- Import table publication
- eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob
+ Import graph tables into Hive
+ eu.dnetlib.dhp.oozie.RunSQLSparkJob
dhp-graph-mapper-${projectVersion}.jar
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
- --conf spark.sql.shuffle.partitions=10000
+ ${sparkClusterOpts}
+ ${sparkResourceOpts}
+ ${sparkApplicationOpts}
- --inputPath${inputPath}/publication
- --hiveDbName${hiveDbName}
- --classNameeu.dnetlib.dhp.schema.oaf.Publication
--hiveMetastoreUris${hiveMetastoreUris}
- --numPartitions10000
+ --sqleu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql
+ --hiveDbName${hiveDbName}
+ --inputPath${inputPath}
-
+
-
-
- yarn
- cluster
- Import table dataset
- eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob
- dhp-graph-mapper-${projectVersion}.jar
-
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
- --conf spark.sql.shuffle.partitions=4000
-
- --inputPath${inputPath}/dataset
- --hiveDbName${hiveDbName}
- --classNameeu.dnetlib.dhp.schema.oaf.Dataset
- --hiveMetastoreUris${hiveMetastoreUris}
- --numPartitions8000
-
-
-
-
-
-
-
- yarn
- cluster
- Import table otherresearchproduct
- eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob
- dhp-graph-mapper-${projectVersion}.jar
-
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
- --conf spark.sql.shuffle.partitions=8000
-
- --inputPath${inputPath}/otherresearchproduct
- --hiveDbName${hiveDbName}
- --classNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct
- --hiveMetastoreUris${hiveMetastoreUris}
- --numPartitions3000
-
-
-
-
-
-
-
- yarn
- cluster
- Import table software
- eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob
- dhp-graph-mapper-${projectVersion}.jar
-
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
- --conf spark.sql.shuffle.partitions=1000
-
- --inputPath${inputPath}/software
- --hiveDbName${hiveDbName}
- --classNameeu.dnetlib.dhp.schema.oaf.Software
- --hiveMetastoreUris${hiveMetastoreUris}
- --numPartitions1000
-
-
-
-
-
-
-
- yarn
- cluster
- Import table datasource
- eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob
- dhp-graph-mapper-${projectVersion}.jar
-
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
- --conf spark.sql.shuffle.partitions=200
-
- --inputPath${inputPath}/datasource
- --hiveDbName${hiveDbName}
- --classNameeu.dnetlib.dhp.schema.oaf.Datasource
- --hiveMetastoreUris${hiveMetastoreUris}
- --numPartitions200
-
-
-
-
-
-
-
- yarn
- cluster
- Import table organization
- eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob
- dhp-graph-mapper-${projectVersion}.jar
-
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
- --conf spark.sql.shuffle.partitions=1000
-
- --inputPath${inputPath}/organization
- --hiveDbName${hiveDbName}
- --classNameeu.dnetlib.dhp.schema.oaf.Organization
- --hiveMetastoreUris${hiveMetastoreUris}
- --numPartitions1000
-
-
-
-
-
-
-
- yarn
- cluster
- Import table project
- eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob
- dhp-graph-mapper-${projectVersion}.jar
-
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
- --conf spark.sql.shuffle.partitions=1000
-
- --inputPath${inputPath}/project
- --hiveDbName${hiveDbName}
- --classNameeu.dnetlib.dhp.schema.oaf.Project
- --hiveMetastoreUris${hiveMetastoreUris}
- --numPartitions1000
-
-
-
-
-
-
-
- yarn
- cluster
- Import table person
- eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob
- dhp-graph-mapper-${projectVersion}.jar
-
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
- --conf spark.sql.shuffle.partitions=1000
-
- --inputPath${inputPath}/person
- --hiveDbName${hiveDbName}
- --classNameeu.dnetlib.dhp.schema.oaf.Person
- --hiveMetastoreUris${hiveMetastoreUris}
- --numPartitions1000
-
-
-
-
-
-
-
- yarn
- cluster
- Import table relation
- eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob
- dhp-graph-mapper-${projectVersion}.jar
-
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
- --conf spark.sql.shuffle.partitions=15000
-
- --inputPath${inputPath}/relation
- --hiveDbName${hiveDbName}
- --classNameeu.dnetlib.dhp.schema.oaf.Relation
- --hiveMetastoreUris${hiveMetastoreUris}
- --numPartitions15000
-
-
-
-
-
-
-
diff --git a/pom.xml b/pom.xml
index 033d88b0b..826d7cbf0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -988,6 +988,9 @@
+
+ spark-24
+
spark-34