workflow and parameter

3 years ago · 0f1acdf6b6
parent 75780fc636
commit 0f1acdf6b6
3 changed files with 131 additions and 0 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json
@ -0,0 +1,21 @@
+[
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the source mdstore path",
+    "paramRequired": true
+  },
+
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "the target mdstore path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "m",
+    "paramLongName": "master",
+    "paramDescription": "the master name",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_dump_reader/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_dump_reader/oozie_app/config-default.xml
@ -0,0 +1,42 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/spark2ApplicationHistory</value>
+    </property>
+    <property>
+        <name>spark2ExtraListeners</name>
+        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
+    </property>
+    <property>
+        <name>spark2SqlQueryExecutionListeners</name>
+        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_dump_reader/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_dump_reader/oozie_app/workflow.xml
@ -0,0 +1,68 @@
+<workflow-app name="read Crossref dump from HDFS" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <!--        <property>-->
+        <!--            <name>workingPath</name>-->
+        <!--            <description>the working dir base path</description>-->
+        <!--        </property>-->
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+
+    </parameters>
+
+    <start to="generateCrossrefDataset"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ReadCrossRefDump">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords</main-class>
+            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
+            <arg>--workingPath</arg><arg>/data/doiboost/crossref/</arg>
+            <arg>--crossrefFileNameTarGz</arg><arg>crossref.tar.gz</arg>
+        </java>
+        <ok to="generateCrossrefDataset"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="generateCrossrefDataset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>SparkCreateCrossredDataset</name>
+            <class>eu.dnetlib.doiboost.crossref.GenerateCrossrefDatasetSpark</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --conf spark.dynamicAllocation.enabled=true
+                --conf spark.dynamicAllocation.maxExecutors=20
+                --executor-memory=6G
+                --driver-memory=7G
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+            </spark-opts>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>/data/doiboost/crossref/filess</arg>
+            <arg>--targetPath</arg><arg>/tmp/miriam/crossref/crossrefDataset</arg>
+
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>