diff --git a/dhp-shade-package/dependency-reduced-pom.xml b/dhp-shade-package/dependency-reduced-pom.xml index 04843072f..ca2c6fd30 100644 --- a/dhp-shade-package/dependency-reduced-pom.xml +++ b/dhp-shade-package/dependency-reduced-pom.xml @@ -95,6 +95,10 @@ byte-buddy-agent net.bytebuddy + + objenesis + org.objenesis + @@ -102,6 +106,12 @@ mockito-junit-jupiter 3.3.3 test + + + junit-jupiter-api + org.junit.jupiter + + diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java index 1adad104e..8172456bb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java @@ -160,7 +160,7 @@ public class ORCIDExtractor extends Thread { } } finally { for (SequenceFile.Writer k : fileMap.values()) { - log.info("Thread {}: Completed processed {} items", id, extractedItem); + log.info("Thread {}: Completed processed {} items", id, extractedItem); k.hflush(); k.close(); } diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index d7ae60a91..3cbc5477c 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -88,6 +88,12 @@ eu.dnetlib.dhp dhp-common ${project.version} + + + log4j + log4j + + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml index 9608732ed..7fbd5e301 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/config-default.xml @@ -23,4 +23,8 @@ hiveDbName 
openaire + + sparkSqlWarehouseDir + /user/hive/warehouse + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql new file mode 100644 index 000000000..456713612 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql @@ -0,0 +1,49 @@ + +CREATE TEMPORARY VIEW datasource USING json OPTIONS ( path "${inputPath}/datasource"); /*EOS*/ +CREATE TABLE IF NOT EXISTS ${hiveDbName}.datasource + USING parquet + CLUSTERED BY ( id ) INTO 200 BUCKETS + AS SELECT * FROM datasource DISTRIBUTE BY id; /*EOS*/ + +CREATE TEMPORARY VIEW dataset USING json OPTIONS ( path "${inputPath}/dataset"); /*EOS*/ +CREATE TABLE IF NOT EXISTS ${hiveDbName}.dataset + USING parquet + CLUSTERED BY ( id ) INTO 4000 BUCKETS + AS SELECT * FROM dataset DISTRIBUTE BY id; /*EOS*/ + +CREATE TEMPORARY VIEW organization USING json OPTIONS ( path "${inputPath}/organization"); /*EOS*/ +CREATE TABLE IF NOT EXISTS ${hiveDbName}.organization + USING parquet + CLUSTERED BY ( id ) INTO 1000 BUCKETS + AS SELECT * FROM organization DISTRIBUTE BY id; /*EOS*/ + +CREATE TEMPORARY VIEW otherresearchproduct USING json OPTIONS ( path "${inputPath}/otherresearchproduct"); /*EOS*/ +CREATE TABLE IF NOT EXISTS ${hiveDbName}.otherresearchproduct + USING parquet + CLUSTERED BY ( id ) INTO 8000 BUCKETS + AS SELECT * FROM otherresearchproduct DISTRIBUTE BY id; /*EOS*/ + +CREATE TEMPORARY VIEW project USING json OPTIONS ( path "${inputPath}/project"); /*EOS*/ +CREATE TABLE IF NOT EXISTS ${hiveDbName}.project + USING parquet + CLUSTERED BY ( id ) INTO 1000 BUCKETS + AS SELECT * FROM project DISTRIBUTE BY id; /*EOS*/ + +CREATE TEMPORARY VIEW publication USING json OPTIONS ( path "${inputPath}/publication"); /*EOS*/ +CREATE TABLE IF NOT 
EXISTS ${hiveDbName}.publication + USING parquet + CLUSTERED BY ( id ) INTO 10000 BUCKETS + AS SELECT * FROM publication DISTRIBUTE BY id; /*EOS*/ + +CREATE TEMPORARY VIEW relation USING json OPTIONS ( path "${inputPath}/relation"); /*EOS*/ +DROP TABLE IF EXISTS ${hiveDbName}.relation; /*EOS*/ +CREATE TABLE IF NOT EXISTS ${hiveDbName}.relation + USING parquet + PARTITIONED BY ( relClass ) + AS SELECT * FROM relation DISTRIBUTE BY source,target; /*EOS*/ + +CREATE TEMPORARY VIEW software USING json OPTIONS ( path "${inputPath}/software"); /*EOS*/ +CREATE TABLE IF NOT EXISTS ${hiveDbName}.software + USING parquet + CLUSTERED BY ( id ) INTO 1000 BUCKETS + AS SELECT * FROM software DISTRIBUTE BY id; /*EOS*/ \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index 872ef8a2d..37dd3523c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -51,6 +51,22 @@ spark2EventLogDir spark 2.* event log dir location + + + sparkClusterOpts + --conf spark.network.timeout=600 --conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory + spark cluster-wide options + + + sparkResourceOpts + --executor-memory=3G --conf spark.executor.memoryOverhead=3G --executor-cores=6 --driver-memory=8G --driver-cores=4 + spark resource options + + + sparkApplicationOpts + --conf spark.sql.shuffle.partitions=3840 + spark application options + 
eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + Import graph tables into Hive + eu.dnetlib.dhp.oozie.RunSQLSparkJob dhp-graph-mapper-${projectVersion}.jar - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --conf spark.sql.shuffle.partitions=10000 + ${sparkClusterOpts} + ${sparkResourceOpts} + ${sparkApplicationOpts} - --inputPath${inputPath}/publication - --hiveDbName${hiveDbName} - --classNameeu.dnetlib.dhp.schema.oaf.Publication --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions10000 + --sqleu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/import_graph.sql + --hiveDbName${hiveDbName} + --inputPath${inputPath} - + - - - yarn - cluster - Import table dataset - eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --conf spark.sql.shuffle.partitions=4000 - - --inputPath${inputPath}/dataset - --hiveDbName${hiveDbName} - --classNameeu.dnetlib.dhp.schema.oaf.Dataset - --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions8000 - - - - - - - - yarn - 
cluster - Import table otherresearchproduct - eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --conf spark.sql.shuffle.partitions=8000 - - --inputPath${inputPath}/otherresearchproduct - --hiveDbName${hiveDbName} - --classNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions3000 - - - - - - - - yarn - cluster - Import table software - eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --conf spark.sql.shuffle.partitions=1000 - - --inputPath${inputPath}/software - --hiveDbName${hiveDbName} - --classNameeu.dnetlib.dhp.schema.oaf.Software - --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions1000 - - - - - - - - yarn - cluster - Import table datasource - eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - 
--executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --conf spark.sql.shuffle.partitions=200 - - --inputPath${inputPath}/datasource - --hiveDbName${hiveDbName} - --classNameeu.dnetlib.dhp.schema.oaf.Datasource - --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions200 - - - - - - - - yarn - cluster - Import table organization - eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --conf spark.sql.shuffle.partitions=1000 - - --inputPath${inputPath}/organization - --hiveDbName${hiveDbName} - --classNameeu.dnetlib.dhp.schema.oaf.Organization - --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions1000 - - - - - - - - yarn - cluster - Import table project - eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --conf spark.sql.shuffle.partitions=1000 - - --inputPath${inputPath}/project - --hiveDbName${hiveDbName} - --classNameeu.dnetlib.dhp.schema.oaf.Project - --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions1000 - - - - - - - - yarn - cluster - Import table person - eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --conf spark.sql.shuffle.partitions=1000 - - --inputPath${inputPath}/person - --hiveDbName${hiveDbName} - --classNameeu.dnetlib.dhp.schema.oaf.Person - --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions1000 - - - - - - - - yarn - cluster - Import table relation - eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.executor.memoryOverhead=${sparkExecutorMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf 
spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --conf spark.sql.shuffle.partitions=15000 - - --inputPath${inputPath}/relation - --hiveDbName${hiveDbName} - --classNameeu.dnetlib.dhp.schema.oaf.Relation - --hiveMetastoreUris${hiveMetastoreUris} - --numPartitions15000 - - - - - - - diff --git a/pom.xml b/pom.xml index 033d88b0b..826d7cbf0 100644 --- a/pom.xml +++ b/pom.xml @@ -988,6 +988,9 @@ + + spark-24 + spark-34