forked from D-Net/dnet-hadoop
add workflow and sub-workflows for producing the orcid report and for the actual orcid cleaning
This commit is contained in:
parent c56a43c90b
commit 902b0db85a
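
The parent workflow added below (orcid_status) routes on a single `clean` property: false produces the report, true performs the actual cleaning. As a minimal sketch, a submit-time configuration could look like the following; only the property names come from the workflow parameters below, the paths are illustrative placeholders:

    <!-- hypothetical job configuration for submitting orcid_status; paths are placeholders -->
    <configuration>
        <property>
            <name>inputPath</name>
            <value>/tmp/graph/input</value>
        </property>
        <property>
            <name>outputPath</name>
            <value>/tmp/graph/orcid_cleaned</value>
        </property>
        <property>
            <name>orcidInputPath</name>
            <value>/tmp/orcid/authoritative</value>
        </property>
        <property>
            <name>clean</name>
            <!-- false produces the report, true performs the cleaning -->
            <value>false</value>
        </property>
    </configuration>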
@@ -81,7 +81,7 @@
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-           <name>prepare publication</name>
+           <name>orcid prepare publication</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>

@@ -107,7 +107,7 @@
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-           <name>prepare publication</name>
+           <name>orcid prepare dataset</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>

@@ -133,7 +133,7 @@
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-           <name>prepare publication</name>
+           <name>orcid prepare software</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>

@@ -159,7 +159,7 @@
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
-           <name>prepare publication</name>
+           <name>orcid prepare orp</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>

@@ -187,6 +187,7 @@
        <switch>
            <case to="make_report">${wf:conf('clean') eq false}</case>
+           <case to="clean_orcid_copy">${wf:conf('clean') eq true}</case>
            <default to="make_report"/>
        </switch>
    </decision>

@@ -242,7 +243,7 @@
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
-           <arg>--outputPath</arg><arg>${utputPath}/dataset</arg>
+           <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>

@@ -269,7 +270,7 @@
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
-           <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
+           <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>

@@ -353,9 +354,9 @@

    <fork name="clean_orcid">
        <path start="clean_publication_orcid"/>
-       <path start="copy_organization"/>
-       <path start="copy_projects"/>
-       <path start="copy_datasources"/>
+       <path start="clean_dataset_orcid"/>
+       <path start="clean_orp_orcid"/>
+       <path start="clean_software_orcid"/>
    </fork>

    <action name="clean_publication_orcid">

@@ -430,7 +431,7 @@
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
-           <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
+           <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>

@@ -0,0 +1,232 @@
<workflow-app name="clean_orcid" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
            <name>inputPath</name>
            <description>the input path to read graph content</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the target path to store cleaned graph</description>
        </property>
        <property>
            <name>orcidInputPath</name>
            <description>the input path where to find the orcid authoritative information</description>
        </property>
        <property>
            <name>inputPreparedInfoPath</name>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>

    <start to="reset_outputpath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="reset_outputpath">
        <fs>
            <delete path="${outputPath}"/>
            <mkdir path="${outputPath}"/>
        </fs>
        <ok to="clean_orcid_copy"/>
        <error to="Kill"/>
    </action>

    <fork name="clean_orcid_copy">
        <path start="copy_relation"/>
        <path start="copy_organization"/>
        <path start="copy_projects"/>
        <path start="copy_datasources"/>
    </fork>

    <action name="copy_relation">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <arg>${nameNode}/${inputPath}/relation</arg>
            <arg>${nameNode}/${outputPath}/relation</arg>
        </distcp>
        <ok to="wait_copy"/>
        <error to="Kill"/>
    </action>

    <action name="copy_organization">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <arg>${nameNode}/${inputPath}/organization</arg>
            <arg>${nameNode}/${outputPath}/organization</arg>
        </distcp>
        <ok to="wait_copy"/>
        <error to="Kill"/>
    </action>

    <action name="copy_projects">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <arg>${nameNode}/${inputPath}/project</arg>
            <arg>${nameNode}/${outputPath}/project</arg>
        </distcp>
        <ok to="wait_copy"/>
        <error to="Kill"/>
    </action>

    <action name="copy_datasources">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <arg>${nameNode}/${inputPath}/datasource</arg>
            <arg>${nameNode}/${outputPath}/datasource</arg>
        </distcp>
        <ok to="wait_copy"/>
        <error to="Kill"/>
    </action>

    <join name="wait_copy" to="clean_orcid"/>

    <fork name="clean_orcid">
        <path start="clean_publication_orcid"/>
        <path start="clean_dataset_orcid"/>
        <path start="clean_orp_orcid"/>
        <path start="clean_software_orcid"/>
    </fork>

    <action name="clean_publication_orcid">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean ORCID for Publications</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.CleanAuthorPidsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
            <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--preparedInfoPath</arg><arg>${inputPreparedInfoPath}/publication</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
        </spark>
        <ok to="wait_clean"/>
        <error to="Kill"/>
    </action>

    <action name="clean_dataset_orcid">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean ORCID for Datasets</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.CleanAuthorPidsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--preparedInfoPath</arg><arg>${inputPreparedInfoPath}/dataset</arg>
            <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
        </spark>
        <ok to="wait_clean"/>
        <error to="Kill"/>
    </action>

    <action name="clean_orp_orcid">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean ORCID for ORP</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.CleanAuthorPidsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--preparedInfoPath</arg><arg>${inputPreparedInfoPath}/otherresearchproduct</arg>
            <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
        </spark>
        <ok to="wait_clean"/>
        <error to="Kill"/>
    </action>

    <action name="clean_software_orcid">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean ORCID for Software</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.CleanAuthorPidsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--preparedInfoPath</arg><arg>${inputPreparedInfoPath}/software</arg>
            <arg>--outputPath</arg><arg>${outputPath}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
        </spark>
        <ok to="wait_clean"/>
        <error to="Kill"/>
    </action>

    <join name="wait_clean" to="End"/>

    <end name="End"/>
</workflow-app>

@@ -0,0 +1,3 @@
## This is a classpath-based import file (this header is required)
make_report classpath eu/dnetlib/dhp/oa/graph/clean_orcid/wf/report/oozie_app
clean_orcid classpath eu/dnetlib/dhp/oa/graph/clean_orcid/wf/clean/oozie_app
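
The two entries above map logical sub-workflow names to oozie_app directories on the classpath, so the assembled parent application is expected to contain make_report/ and clean_orcid/ under its own application path. The parent workflow below then invokes them with the standard Oozie sub-workflow action, e.g.:

    <!-- pattern as used in the orcid_status workflow below -->
    <sub-workflow>
        <app-path>${wf:appPath()}/clean_orcid</app-path>
        <propagate-configuration/>
    </sub-workflow>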

@@ -0,0 +1,18 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
</configuration>

@@ -0,0 +1,231 @@
<workflow-app name="orcid_status" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
            <name>inputPath</name>
            <description>the input path to read graph content</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the target path to store cleaned graph</description>
        </property>
        <property>
            <name>orcidInputPath</name>
            <description>the input path where to find the orcid authoritative information</description>
        </property>
        <property>
            <name>clean</name>
            <value>false</value>
            <description>determines if the orcid should be cleaned in the graph (true) or the report should be produced (false)</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>

    <start to="reset_outputpath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="reset_outputpath">
        <fs>
            <delete path="${outputPath}"/>
            <mkdir path="${outputPath}"/>
        </fs>
<ok to="perpare_result"/>
        <error to="Kill"/>
    </action>

<fork name="perpare_result">
        <path start="prepare_publication"/>
        <path start="prepare_dataset"/>
        <path start="prepare_software"/>
        <path start="prepare_orp"/>
    </fork>

    <action name="prepare_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>orcid prepare publication</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
            <arg>--outputPath</arg><arg>${workingDir}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
        </spark>
        <ok to="wait_prepare"/>
        <error to="Kill"/>
    </action>

    <action name="prepare_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>orcid prepare dataset</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
            <arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
        </spark>
        <ok to="wait_prepare"/>
        <error to="Kill"/>
    </action>

    <action name="prepare_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>orcid prepare software</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
            <arg>--outputPath</arg><arg>${workingDir}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
        </spark>
        <ok to="wait_prepare"/>
        <error to="Kill"/>
    </action>

    <action name="prepare_orp">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>orcid prepare orp</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.PrepareResultsSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
            <arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
        </spark>
        <ok to="wait_prepare"/>
        <error to="Kill"/>
    </action>

    <join name="wait_prepare" to="cleanorreport"/>

    <decision name="cleanorreport">
        <switch>
            <case to="make_report">${wf:conf('clean') eq false}</case>
<case to="clean_orcid_copy">${wf:conf('clean') eq true}</case>
            <default to="make_report"/>
        </switch>
    </decision>

    <action name="make_report">
        <sub-workflow>
            <app-path>${wf:appPath()}/make_report</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>inputPreparedInfoPath</name>
                    <value>${workingDir}</value>
                </property>
                <property>
                    <name>orcidInputPath</name>
                    <value>${orcidInputPath}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <action name="clean_orcid">
        <sub-workflow>
            <app-path>${wf:appPath()}/clean_orcid</app-path>
            <propagate-configuration/>
            <configuration>
                <property>
                    <name>inputPreparedInfoPath</name>
                    <value>${workingDir}</value>
                </property>
                <property>
                    <name>orcidInputPath</name>
                    <value>${orcidInputPath}</value>
                </property>
            </configuration>
        </sub-workflow>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>

@@ -0,0 +1,189 @@
<workflow-app name="orcid_report" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
            <name>inputPath</name>
            <description>the input path to read graph content</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the target path to store cleaned graph</description>
        </property>
        <property>
            <name>orcidInputPath</name>
            <description>the input path where to find the orcid authoritative information</description>
        </property>
        <property>
            <name>inputPreparedInfoPath</name>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>

    <start to="reset_outputpath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="reset_outputpath">
        <fs>
            <delete path="${outputPath}"/>
            <mkdir path="${outputPath}"/>
        </fs>
        <ok to="make_report"/>
        <error to="Kill"/>
    </action>

    <fork name="make_report">
        <path start="report_publication"/>
        <path start="report_dataset"/>
        <path start="report_software"/>
        <path start="report_orp"/>
    </fork>

    <action name="report_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Report ORCID on Publication</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.MakeReportSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
            <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--preparedInfoPath</arg><arg>${inputPreparedInfoPath}/publication</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
        </spark>
        <ok to="wait_report"/>
        <error to="Kill"/>
    </action>

    <action name="report_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Report ORCID on Dataset</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.MakeReportSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--preparedInfoPath</arg><arg>${inputPreparedInfoPath}/dataset</arg>
            <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
        </spark>
        <ok to="wait_report"/>
        <error to="Kill"/>
    </action>

    <action name="report_orp">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Report ORCID on ORP</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.MakeReportSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--preparedInfoPath</arg><arg>${inputPreparedInfoPath}/otherresearchproduct</arg>
            <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
        </spark>
        <ok to="wait_report"/>
        <error to="Kill"/>
    </action>

    <action name="report_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Report ORCID on Software</name>
            <class>eu.dnetlib.dhp.oa.graph.clean.authorpids.MakeReportSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--preparedInfoPath</arg><arg>${inputPreparedInfoPath}/software</arg>
            <arg>--outputPath</arg><arg>${outputPath}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--inputPath</arg><arg>${inputPath}/software</arg>
            <arg>--orcidInputPath</arg><arg>${orcidInputPath}</arg>
        </spark>
        <ok to="wait_report"/>
        <error to="Kill"/>
    </action>

    <join name="wait_report" to="End"/>

    <end name="End"/>
</workflow-app>