diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
index dc0529012..0cf6cdd05 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@@ -13,6 +13,23 @@
         <property>
             <name>isLookupUrl</name>
             <description>the address of the lookUp service</description>
         </property>
+        <property>
+            <name>shouldCleanContext</name>
+            <description>true if the context has to be cleaned</description>
+        </property>
+        <property>
+            <name>contextId</name>
+            <value>sobigdata</value>
+            <description>The id of the context to be removed from the result when the condition is matched.
+                For now it is just sobigdata. A future implementation will take the contextId as a value in a JSON
+                that also specifies the constraints to be verified in order to remove the context from the result</description>
+        </property>
+        <property>
+            <name>verifyParam</name>
+            <value>gcube</value>
+            <description>The constraint to be verified. For now it is hardcoded as gcube and it is searched for in
+                the title: if the title starts with gcube, the context sobigdata is removed from the result if present</description>
+        </property>
         <property>
             <name>sparkDriverMemory</name>
@@ -275,7 +292,131 @@
             </spark>
-            <ok to="End"/>
+            <ok to="should_clean_context"/>
             <error to="Kill"/>
         </action>
+
+    <decision name="should_clean_context">
+        <switch>
+            <case to="fork_clean_context">${wf:conf('shouldCleanContext') eq true}</case>
+            <default to="End"/>
+        </switch>
+    </decision>
+
+    <fork name="fork_clean_context">
+        <path start="clean_publication_context"/>
+        <path start="clean_dataset_context"/>
+        <path start="clean_otherresearchproduct_context"/>
+        <path start="clean_software_context"/>
+    </fork>
+
+    <action name="clean_publication_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean publications context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+            <arg>--workingPath</arg><arg>${workingDir}/working/publication</arg>
+            <arg>--contextId</arg><arg>${contextId}</arg>
+            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
+        </spark>
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="clean_dataset_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean datasets context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+            <arg>--workingPath</arg><arg>${workingDir}/working/dataset</arg>
+            <arg>--contextId</arg><arg>${contextId}</arg>
+            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
+        </spark>
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="clean_otherresearchproduct_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean otherresearchproducts context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+            <arg>--workingPath</arg><arg>${workingDir}/working/otherresearchproduct</arg>
+            <arg>--contextId</arg><arg>${contextId}</arg>
+            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
+        </spark>
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="clean_software_context">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Clean software context</name>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.shuffle.partitions=7680
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
+            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+            <arg>--workingPath</arg><arg>${workingDir}/working/software</arg>
+            <arg>--contextId</arg><arg>${contextId}</arg>
+            <arg>--verifyParam</arg><arg>${verifyParam}</arg>
+        </spark>
+        <ok to="wait_clean_context"/>
+        <error to="Kill"/>
+    </action>
+
+    <join name="wait_clean_context" to="End"/>
 
     <end name="End"/>
 </workflow-app>
\ No newline at end of file
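
The four Spark actions above are the same parametrised job run over the four result tables. As a reference for reviewers, here is a minimal sketch of the per-table logic this step performs, assuming the usual dhp Spark-job shape: the oaf model accessors (Result.getTitle(), Result.getContext(), Context.getId()) are real schema methods, but the class name, method signature, and staging behaviour below are illustrative assumptions, not the actual CleanContextSparkJob source.

    // Illustrative sketch only: NOT the actual CleanContextSparkJob implementation.
    import com.fasterxml.jackson.databind.ObjectMapper;

    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    import eu.dnetlib.dhp.schema.oaf.Result;

    public class CleanContextSketch {

        private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

        /**
         * Removes the context entry whose id equals contextId from every result
         * whose title starts with verifyParam (e.g. drops "sobigdata" from
         * results whose title starts with "gcube").
         */
        public static <R extends Result> void cleanContext(
            SparkSession spark, String inputPath, String workingPath,
            Class<R> clazz, String contextId, String verifyParam) {

            spark
                .read()
                .textFile(inputPath)
                // each line of the graph table is one JSON-serialised entity
                .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz))
                .map((MapFunction<R, R>) r -> {
                    if (r.getTitle() == null || r.getContext() == null) {
                        return r;
                    }
                    boolean titleMatches = r
                        .getTitle()
                        .stream()
                        .anyMatch(t -> t.getValue() != null
                            && t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()));
                    if (titleMatches) {
                        // drop only the offending context; everything else is kept as-is
                        r.getContext().removeIf(c -> contextId.equals(c.getId()));
                    }
                    return r;
                }, Encoders.bean(clazz))
                .write()
                .mode(SaveMode.Overwrite)
                .option("compression", "gzip")
                // staged under --workingPath; presumably written back over --inputPath afterwards
                .json(workingPath);
        }
    }

Passing the result type through --graphTableClassName lets one job serve all four tables: between the four actions only --inputPath, --graphTableClassName, and --workingPath change, so the fork runs the cleanings in parallel and the join waits for all of them before reaching End.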