diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java index 96e6cefb5..238b2b30b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/funderresults/SparkDumpFunderResults.java @@ -81,7 +81,7 @@ public class SparkDumpFunderResults implements Serializable { }).collect(Collectors.toList()).iterator(), Encoders.STRING()) .distinct(); - tmp.foreach((ForeachFunction) f -> log.info("Found Funder {}", f)); + List funderList = tmp.collectAsList(); funderList.forEach(funder -> { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app/config-default.xml deleted file mode 100644 index e5ec3d0ae..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app/config-default.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - hiveMetastoreUris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hiveJdbcUrl - jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 - - - hiveDbName - openaire - - - oozie.launcher.mapreduce.user.classpath.first - true - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app/workflow.xml deleted file mode 100644 index 1a0c71a71..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app/workflow.xml +++ /dev/null @@ -1,345 +0,0 @@ - - - - - sourcePath - the source path - - - outputPath - the output path - - - communityMapPath - the path to the community map - - - selectedResults - the path the the possible subset ot results to be dumped - - - hiveDbName - the target hive database name - - - hiveJdbcUrl - hive server jdbc url - - - hiveMetastoreUris - hive server metastore URIs - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - yarn - cluster - Dump table publication for community/funder related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${selectedResults}/publication - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir}/dump/publication - --communityMapPath${communityMapPath} - --dumpType${dumpType} - - - - - - - - yarn - cluster - Dump table dataset for community/funder related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${selectedResults}/dataset - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${workingDir}/dump/dataset - --communityMapPath${communityMapPath} - - - - - - - - yarn - cluster - Dump table ORP for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${selectedResults}/otherresearchproduct - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir}/dump/otherresearchproduct - --communityMapPath${communityMapPath} - - - - - - - - yarn - cluster - Dump table software for community related products - eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${selectedResults}/software - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir}/dump/software - --communityMapPath${communityMapPath} - - - - - - - - - - yarn - cluster - Prepare association result subset of project info - eu.dnetlib.dhp.oa.graph.dump.community.SparkPrepareResultProject - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${sourcePath} - --outputPath${workingDir}/preparedInfo - - - - - - - - - - - - - - - yarn - cluster - Extend dumped publications with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/dump/publication - --outputPath${outputPath}/ext/publication - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - yarn - cluster - Extend dumped dataset with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/dump/dataset - --outputPath${outputPath}/ext/dataset - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - yarn - cluster - Extend dumped ORP with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/dump/otherresearchproduct - --outputPath${outputPath}/ext/orp - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - yarn - cluster - Extend dumped software with information about project - eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/dump/software - --outputPath${outputPath}/ext/software - --preparedInfoPath${workingDir}/preparedInfo - - - - - - - - - - - diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app/import.txt b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app/import.txt deleted file mode 100644 index 15387e988..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app/import.txt +++ /dev/null @@ -1,2 +0,0 @@ -## This is a classpath-based import file (this header is required) -dump_common classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app/workflow.xml index 319517026..b559a25dc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app/workflow.xml @@ -77,42 +77,259 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - ${wf:appPath()}/dump_common - - - - - sourcePath - ${sourcePath} - - - selectedResults - ${sourcePath} - - - communityMapPath - ${workingDir}/communityMap - - - outputPath - ${workingDir} - - - - - + + + + + + + + + + yarn + cluster + Dump table publication for community/funder related products + eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${selectedResults}/publication + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/dump/publication + --communityMapPath${communityMapPath} + --dumpType${dumpType} + + + + + + + + yarn + cluster + Dump table dataset for community/funder related products + eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${selectedResults}/dataset + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/dump/dataset + --communityMapPath${communityMapPath} + + + + + + + + yarn + cluster + Dump table ORP for community related products + eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${selectedResults}/otherresearchproduct + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir}/dump/otherresearchproduct + --communityMapPath${communityMapPath} + + + + + + + + yarn + cluster + Dump table software for community related products + eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${selectedResults}/software + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/dump/software + --communityMapPath${communityMapPath} + + + + + + + + + + yarn + cluster + Prepare association result subset of project info + eu.dnetlib.dhp.oa.graph.dump.community.SparkPrepareResultProject + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --outputPath${workingDir}/preparedInfo + + + + + + + + + + + + + + + yarn + cluster + Extend dumped publications with information about project + eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/dump/publication + --outputPath${outputPath}/ext/publication + --preparedInfoPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + Extend dumped dataset with information about project + eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/dump/dataset + --outputPath${outputPath}/ext/dataset + --preparedInfoPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + Extend dumped ORP with information about project + eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/dump/otherresearchproduct + --outputPath${outputPath}/ext/orp + --preparedInfoPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + Extend dumped software with information about project + eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/dump/software + --outputPath${outputPath}/ext/software + --preparedInfoPath${workingDir}/preparedInfo + + + - + yarn diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/funder/oozie_app/import.txt b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/funder/oozie_app/import.txt deleted file mode 100644 index 15387e988..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/funder/oozie_app/import.txt +++ /dev/null @@ -1,2 +0,0 @@ -## This is a classpath-based import file (this header is required) -dump_common classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app \ No newline at end of file