Oozie workflow for cleancontext #216
@@ -13,6 +13,23 @@
     <name>isLookupUrl</name>
     <description>the address of the lookUp service</description>
 </property>
+<property>
+    <name>shouldCleanContext</name>
+    <description>true if the context has to be cleaned</description>
+</property>
+<property>
+    <name>contextId</name>
+    <value>sobigdata</value>
+    <description>It is the context id that should be removed from the result if the condition is matched.
+        Now it is just sobigdata. In a future implementation I plan to have the contextId as a value in a JSON
+        where one can also specify the constraints that should be verified to remove the context from the result</description>
+</property>
+<property>
+    <name>verifyParam</name>
+    <value>gcube </value>
+    <description>It is the constraint to be verified. This time it is hardcoded as gcube and it is searched for in
+        the title. If the title starts with gcube then the context sobigdata will be removed from the result if present</description>
+</property>
 <property>
     <name>sparkDriverMemory</name>
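
The two descriptions above boil down to a simple per-record check: when a result's title starts with verifyParam, drop the context identified by contextId. A minimal sketch of that logic, using simplified stand-in types rather than the actual dhp-schemas model (class and field names below are illustrative assumptions only, not the real CleanContextSparkJob implementation):

    import java.util.List;
    import java.util.stream.Collectors;

    // Simplified stand-ins for the graph model; the real job works on the
    // eu.dnetlib.dhp.schema.oaf types and runs as a Spark job on the graph tables.
    class SimpleResult {
        List<String> titles;      // title values of the record
        List<String> contextIds;  // community context ids attached to the record
    }

    public class CleanContextSketch {

        // Drop the context identified by contextId (e.g. "sobigdata") from a result
        // whenever one of its titles starts with verifyParam (e.g. "gcube").
        // Case-insensitive matching is an assumption of this sketch.
        static void cleanContext(SimpleResult r, String contextId, String verifyParam) {
            boolean titleMatches = r.titles.stream()
                    .anyMatch(t -> t.toLowerCase().startsWith(verifyParam.trim().toLowerCase()));
            if (titleMatches) {
                r.contextIds = r.contextIds.stream()
                        .filter(id -> !id.equals(contextId))
                        .collect(Collectors.toList());
            }
        }
    }
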
@@ -275,7 +292,131 @@
             <error to="Kill"/>
         </action>
 
-    <join name="wait_clean" to="End"/>
+    <join name="wait_clean" to="clean_context"/>
+
+    <decision name="clean_context">
+        <switch>
+            <case to="fork_clean_context">${wf:conf('shouldCleanContext') eq true}</case>
+            <default to="End"/>
+        </switch>
+    </decision>
+
+    <fork name="fork_clean_context">
+        <path start="clean_publication_context"/>
+        <path start="clean_dataset_context"/>
+        <path start="clean_otherresearchproduct_context"/>
+        <path start="clean_software_context"/>
+    </fork>
+
+    <action name="clean_publication_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean publications context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+            <arg>--workingPath</arg><arg>${workingDir}/working/publication</arg>
+            <arg>--contextId</arg><arg>${contextId}</arg>
+            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
+        </spark>
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="clean_dataset_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean datasets Context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+            <arg>--workingPath</arg><arg>${workingDir}/working/dataset</arg>
+            <arg>--contextId</arg><arg>${contextId}</arg>
+            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
+        </spark>
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="clean_otherresearchproduct_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean otherresearchproducts context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+            <arg>--workingPath</arg><arg>${workingDir}/working/otherresearchproduct</arg>
+            <arg>--contextId</arg><arg>${contextId}</arg>
+            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
+        </spark>
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="clean_software_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean softwares context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+            <arg>--workingPath</arg><arg>${workingDir}/working/software</arg>
+            <arg>--contextId</arg><arg>${contextId}</arg>
+            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
+        </spark>
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <join name="wait_clean_context" to="End"/>
+
     <end name="End"/>
 </workflow-app>
It would be better to include a description for this parameter, to explain its purpose and, if it is possible to include multiple contextIds, how they should be formatted.
This is just the first naive implementation of the context cleaning. I have no idea how it will be once it is done properly.
It might be the first naive implementation, but looking at the Oozie workflow, it is not obvious what role a parameter plays when it is not accompanied by any description.
extended
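
For reference, one possible shape for the JSON-valued contextId parameter hinted at in the property description, covering the multiple-contexts case raised above. This format is purely hypothetical and not part of this PR; the field names ("contextId", "constraints", "field", "verifyParam") are assumptions, and Jackson is assumed to be on the classpath only for the sake of the example:

    import com.fasterxml.jackson.databind.ObjectMapper;
    import java.util.List;

    // Hypothetical multi-context configuration: each entry names a context id and the
    // constraints under which it should be removed from a result. Not implemented here.
    public class ContextCleaningConfExample {
        public static void main(String[] args) throws Exception {
            String json = "[{\"contextId\":\"sobigdata\","
                    + "\"constraints\":[{\"field\":\"title\",\"verifyParam\":\"gcube\"}]}]";
            // the cleaning job could read such a parameter as a list of entries
            List<?> entries = new ObjectMapper().readValue(json, List.class);
            System.out.println(entries);
        }
    }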