adding test and resources

This commit is contained in:
Miriam Baglioni 2021-03-30 10:26:03 +02:00
parent efd34c63ae
commit d69c19e3fe
4 changed files with 343 additions and 77 deletions

View File

@ -0,0 +1,31 @@
[{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
}
,{
"paramName": "sp",
"paramLongName": "scholixPath",
"paramDescription": "the path of the scholix relations",
"paramRequired": false
},
{
"paramName": "rp",
"paramLongName": "relationPath",
"paramDescription": "the path of the openaire graph relations",
"paramRequired": false
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the output path for the selected scholix relations",
"paramRequired": false
},{
"paramName": "rePath",
"paramLongName": "resultPath",
"paramDescription": "the path of the openaire graph result entities (publication, dataset, software or otherresearchproduct)",
"paramRequired": false
}
]

View File

@ -0,0 +1,35 @@
[{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
}
,{
"paramName": "ssp",
"paramLongName": "scholixSummaryPath",
"paramDescription": "the path of the scholix summaries",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the openaire graph input path",
"paramRequired": false
},{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the output path for the selected scholix relations",
"paramRequired": false
},{
"paramName": "sep",
"paramLongName": "scholixEnrichedPath",
"paramDescription": "the input path of the enriched scholix data",
"paramRequired": false
},{
"paramName": "oep",
"paramLongName": "openaireEnrichedPath",
"paramDescription": "the input path of the enriched openaire results",
"paramRequired": false
}
]

View File

@ -10,16 +10,24 @@
<name>nameNode</name> <name>nameNode</name>
<value>hdfs://nameservice1</value> <value>hdfs://nameservice1</value>
</property> </property>
<!-- <property>-->
<!-- <name>spark2YarnHistoryServerAddress</name>-->
<!-- <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>-->
<!-- </property>-->
<property> <property>
<name>hive_metastore_uris</name> <name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value> <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property> </property>
<property> <property>
<name>spark2YarnHistoryServerAddress</name> <name>hiveJdbcUrl</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value> <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property> </property>
<!-- GARR --> <!-- GARR -->
<!-- <property>--> <!-- <property>-->
@ -31,7 +39,7 @@
<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>--> <!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
<!-- </property>--> <!-- </property>-->
<!-- <property>--> <!-- <property>-->
<!-- <name>hive_metastore_uris</name>--> <!-- <name>hiveMetastoreUris</name>-->
<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>--> <!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
<!-- </property>--> <!-- </property>-->
<!-- <property>--> <!-- <property>-->
@ -53,16 +61,17 @@
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <value>spark2</value>
</property> </property>
<property> <!-- <property>-->
<name>spark2EventLogDir</name> <!-- <name>spark2EventLogDir</name>-->
<value>/user/spark/spark2ApplicationHistory</value> <!-- <value>/user/spark/spark2ApplicationHistory</value>-->
</property> <!-- </property>-->
<property> <!-- <property>-->
<name>spark2ExtraListeners</name> <!-- <name>spark2ExtraListeners</name>-->
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value> <!-- <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>-->
</property> <!-- </property>-->
<property> <!-- <property>-->
<name>spark2SqlQueryExecutionListeners</name> <!-- <name>spark2SqlQueryExecutionListeners</name>-->
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value> <!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
</property> <!-- </property>-->
</configuration> </configuration>

View File

@ -1,8 +1,28 @@
<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Context Propagation Preparation" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>workingPath</name> <name>scholixPath</name>
<description>the Working Path</description> <description>the Scholix Path</description>
</property>
<property>
<name>scholixSummaryPath</name>
<description>the Scholix Summaries Path</description>
</property>
<property>
<name>inputPath</name>
<description>the OpenAIRE Graph Input Path</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
</property> </property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkDriverMemory</name>
@ -16,9 +36,51 @@
<name>sparkExecutorCores</name> <name>sparkExecutorCores</name>
<description>number of cores used by single executor</description> <description>number of cores used by single executor</description>
</property> </property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters> </parameters>
<start to="CreateEBIDataSet"/> <global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="select_scholix_relations"/>
<kill name="Kill"> <kill name="Kill">
@ -26,74 +88,203 @@
</kill> </kill>
<action name="GenerateBaselineDataset"> <action name="select_scholix_relations">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Create Baselnie DataSet</name> <name>Select Scholix Relations</name>
<class>eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame</class> <class>eu.dnetlib.dhp.contextpropagation.SparkSelectScholixRelations</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=1
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
${sparkExtraOPT} --driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts> </spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--inputPath</arg><arg>${scholixPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/scholixAllowedRelations</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="enrich_scholix_step1"/>
<error to="Kill"/>
</action>
<!--
  Step 1: runs SparkEnrichScholixStep1 on the relations stored under
  ${workingDir}/scholixAllowedRelations together with the scholix summaries
  (${scholixSummaryPath}) and writes to ${workingDir}/scholixEnriched.
  On success the workflow moves on to fork_enrich_scholix_step2; any error goes to Kill.
-->
<action name="enrich_scholix_step1">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Enrich Scholix Step1</name>
        <class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep1</class>
        <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <arg>--inputPath</arg>
        <arg>${workingDir}/scholixAllowedRelations</arg>
        <arg>--scholixSummaryPath</arg>
        <arg>${scholixSummaryPath}</arg>
        <arg>--outputPath</arg>
        <arg>${workingDir}/scholixEnriched</arg>
        <arg>--master</arg>
        <arg>yarn</arg>
    </spark>
    <ok to="fork_enrich_scholix_step2"/>
    <error to="Kill"/>
</action>
<!-- Starts the four step-2 branches (one per result type) in parallel. -->
<fork name="fork_enrich_scholix_step2">
    <path start="enrich_scholix_step2_publication"/>
    <path start="enrich_scholix_step2_dataset"/>
    <path start="enrich_scholix_step2_software"/>
    <path start="enrich_scholix_step2_orp"/>
</fork>
<!--
  Step 2, publication branch: runs SparkEnrichScholixStep2 with the scholix data
  (${scholixPath}), the graph relations (${inputPath}/relation) and the publication
  results (${inputPath}/publication); writes to ${workingDir}/openaireEnriched/publication.
  Started by fork_enrich_scholix_step2; transitions to join_enrich on success, Kill on error.
-->
<action name="enrich_scholix_step2_publication">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Enrich Scholix Step2</name>
        <class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
        <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <arg>--scholixPath</arg><arg>${scholixPath}</arg>
        <arg>--relationPath</arg><arg>${inputPath}/relation</arg>
        <arg>--resultPath</arg><arg>${inputPath}/publication</arg>
        <arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/publication</arg>
        <arg>--master</arg><arg>yarn</arg>
    </spark>
    <ok to="join_enrich"/>
    <error to="Kill"/>
</action>
<!--
  Step 2, dataset branch: runs SparkEnrichScholixStep2 with the scholix data
  (${scholixPath}), the graph relations (${inputPath}/relation) and the dataset
  results (${inputPath}/dataset); writes to ${workingDir}/openaireEnriched/dataset.
  Started by fork_enrich_scholix_step2; transitions to join_enrich on success, Kill on error.
-->
<action name="enrich_scholix_step2_dataset">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Enrich Scholix Step2</name>
        <class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
        <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <arg>--scholixPath</arg><arg>${scholixPath}</arg>
        <arg>--relationPath</arg><arg>${inputPath}/relation</arg>
        <arg>--resultPath</arg><arg>${inputPath}/dataset</arg>
        <arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/dataset</arg>
        <arg>--master</arg><arg>yarn</arg>
    </spark>
    <ok to="join_enrich"/>
    <error to="Kill"/>
</action>
<!--
  Step 2, software branch: runs SparkEnrichScholixStep2 with the scholix data
  (${scholixPath}), the graph relations (${inputPath}/relation) and the software
  results (${inputPath}/software); writes to ${workingDir}/openaireEnriched/software.
  Started by fork_enrich_scholix_step2; transitions to join_enrich on success, Kill on error.
-->
<action name="enrich_scholix_step2_software">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Enrich Scholix Step2</name>
        <class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
        <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <arg>--scholixPath</arg><arg>${scholixPath}</arg>
        <arg>--relationPath</arg><arg>${inputPath}/relation</arg>
        <arg>--resultPath</arg><arg>${inputPath}/software</arg>
        <arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/software</arg>
        <arg>--master</arg><arg>yarn</arg>
    </spark>
    <ok to="join_enrich"/>
    <error to="Kill"/>
</action>
<!--
  Step 2, otherresearchproduct branch: runs SparkEnrichScholixStep2 with the scholix
  data (${scholixPath}), the graph relations (${inputPath}/relation) and the
  otherresearchproduct results (${inputPath}/otherresearchproduct); writes to
  ${workingDir}/openaireEnriched/otherresearchproduct.
  Started by fork_enrich_scholix_step2; transitions to join_enrich on success, Kill on error.
-->
<action name="enrich_scholix_step2_orp">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Enrich Scholix Step2</name>
        <class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
        <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <arg>--scholixPath</arg><arg>${scholixPath}</arg>
        <arg>--relationPath</arg><arg>${inputPath}/relation</arg>
        <arg>--resultPath</arg><arg>${inputPath}/otherresearchproduct</arg>
        <arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/otherresearchproduct</arg>
        <arg>--master</arg><arg>yarn</arg>
    </spark>
    <ok to="join_enrich"/>
    <error to="Kill"/>
</action>
<!-- Target of the four enrich_scholix_step2_* actions on success; continues with enrich_scholix_step3. -->
<join name="join_enrich" to="enrich_scholix_step3"/>
<action name="enrich_scholix_step3">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- Spark application name for the action that runs SparkEnrichScholixStep3. -->
<name>Enrich Scholix Step3</name>
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep3</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--scholixEnrichedPath</arg><arg>${workingDir}/scholixEnriched</arg>
<arg>--openaireEnrichedPath</arg><arg>${workingDir}/openaireEnriched</arg>
<arg>--outputPath</arg><arg>${workingDir}/enrichedEntities</arg>
<arg>--master</arg><arg>yarn</arg> <arg>--master</arg><arg>yarn</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="CreateEBIDataSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create EBI DataSet</name>
<class>eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=1000
${sparkExtraOPT}
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="GenerateUpdates"/>
<error to="Kill"/>
</action>
<action name="GenerateUpdates">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Baselnie DataSet</name>
<class>eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=1
--driver-memory=${sparkDriverMemory}
--executor-cores=${sparkExecutorCores}
${sparkExtraOPT}
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>