forked from D-Net/dnet-hadoop
modified resource file, workflow anf default-config. Add 3g of memory Overhead and specified the shuffle partition in the wf confiduration. Removed the multiple instantiation in the wf because of different implementation of the spark job
This commit is contained in:
parent
18bfff8af3
commit
34358afe75
|
@ -1,10 +1,5 @@
|
||||||
[
|
[
|
||||||
{
|
|
||||||
"paramName":"is",
|
|
||||||
"paramLongName":"isLookUpUrl",
|
|
||||||
"paramDescription": "URL of the isLookUp Service",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"paramName":"s",
|
"paramName":"s",
|
||||||
"paramLongName":"sourcePath",
|
"paramLongName":"sourcePath",
|
||||||
|
@ -17,12 +12,7 @@
|
||||||
"paramDescription": "the json path associated to each selection field",
|
"paramDescription": "the json path associated to each selection field",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"paramName":"tn",
|
|
||||||
"paramLongName":"resultTableName",
|
|
||||||
"paramDescription": "the name of the result table we are currently working on",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"paramName": "out",
|
"paramName": "out",
|
||||||
"paramLongName": "outputPath",
|
"paramLongName": "outputPath",
|
||||||
|
@ -35,17 +25,19 @@
|
||||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"paramName": "test",
|
|
||||||
"paramLongName": "isTest",
|
|
||||||
"paramDescription": "Parameter intended for testing purposes only. True if the reun is relatesd to a test and so the taggingConf parameter should be loaded",
|
|
||||||
"paramRequired": false
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"paramName": "tg",
|
"paramName": "tg",
|
||||||
"paramLongName": "taggingConf",
|
"paramLongName": "taggingConf",
|
||||||
"paramDescription": "this parameter is intended for testing purposes only. It is a possible tagging configuration obtained via the XQUERY. Intended to be removed",
|
"paramDescription": "this parameter is intended for testing purposes only. It is a possible tagging configuration obtained via the XQUERY. Intended to be removed",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"paramName": "p",
|
||||||
|
"paramLongName": "production",
|
||||||
|
"paramDescription": "this parameter is intended for testing purposes only. It is a possible tagging configuration obtained via the XQUERY. Intended to be removed",
|
||||||
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
|
|
||||||
]
|
]
|
|
@ -45,10 +45,14 @@
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkExecutorMemory</name>
|
<name>sparkExecutorMemory</name>
|
||||||
<value>6G</value>
|
<value>5G</value>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkExecutorCores</name>
|
<name>memoryOverhead</name>
|
||||||
<value>1</value>
|
<value>3g</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>partitions</name>
|
||||||
|
<value>3284</value>
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
|
@ -4,10 +4,6 @@
|
||||||
<name>sourcePath</name>
|
<name>sourcePath</name>
|
||||||
<description>the source path</description>
|
<description>the source path</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>isLookUpUrl</name>
|
|
||||||
<description>the isLookup service endpoint</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
<property>
|
||||||
<name>pathMap</name>
|
<name>pathMap</name>
|
||||||
<description>the json path associated to each selection field</description>
|
<description>the json path associated to each selection field</description>
|
||||||
|
@ -102,16 +98,9 @@
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="copy_wait" to="fork_exec_bulktag"/>
|
<join name="copy_wait" to="exec_bulktag"/>
|
||||||
|
|
||||||
<fork name="fork_exec_bulktag">
|
<action name="exec_bulktag">
|
||||||
<path start="bulktag_publication"/>
|
|
||||||
<path start="bulktag_dataset"/>
|
|
||||||
<path start="bulktag_otherresearchproduct"/>
|
|
||||||
<path start="bulktag_software"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="bulktag_publication">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
|
@ -122,104 +111,23 @@
|
||||||
--num-executors=${sparkExecutorNumber}
|
--num-executors=${sparkExecutorNumber}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--conf spark.executor.memoryOverhead=${memeoryOverhead}
|
||||||
|
--conf spark.sql.shuffle.partitions=${partitions}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--outputPath</arg><arg>${outputPath}/</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
|
||||||
<arg>--pathMap</arg><arg>${pathMap}</arg>
|
<arg>--pathMap</arg><arg>${pathMap}</arg>
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
<arg>--production</arg><arg>${production}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="bulktag_dataset">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn-cluster</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>bulkTagging-dataset</name>
|
|
||||||
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
|
||||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--num-executors=${sparkExecutorNumber}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
|
||||||
<arg>--pathMap</arg><arg>${pathMap}</arg>
|
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="bulktag_otherresearchproduct">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn-cluster</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>bulkTagging-orp</name>
|
|
||||||
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
|
||||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--num-executors=${sparkExecutorNumber}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
|
||||||
<arg>--pathMap</arg><arg>${pathMap}</arg>
|
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="bulktag_software">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn-cluster</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>bulkTagging-software</name>
|
|
||||||
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
|
||||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--num-executors=${sparkExecutorNumber}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
|
||||||
<arg>--pathMap</arg><arg>${pathMap}</arg>
|
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="wait" to="End"/>
|
|
||||||
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
Loading…
Reference in New Issue