new workflow definition to be inserted in the provision pipeline

This commit is contained in:
Miriam Baglioni 2020-04-30 14:38:54 +02:00
parent 4b0bd91012
commit ce8b1d0bc3
7 changed files with 302 additions and 140 deletions

View File

@ -4,18 +4,6 @@
<name>sourcePath</name> <name>sourcePath</name>
<description>the source path</description> <description>the source path</description>
</property> </property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property> <property>
<name>isLookupUrl</name> <name>isLookupUrl</name>
<description>the isLookup service endpoint</description> <description>the isLookup service endpoint</description>
@ -24,6 +12,10 @@
<name>protoMap</name> <name>protoMap</name>
<description>the json path associated to each selection field</description> <description>the json path associated to each selection field</description>
</property> </property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
</parameters> </parameters>
<start to="reset-outputpath"/> <start to="reset-outputpath"/>
@ -34,27 +26,73 @@
<action name="reset-outputpath"> <action name="reset-outputpath">
<fs> <fs>
<delete path='${workingDir}/relation'/> <delete path='${outputPath}/relation'/>
<delete path='${workingDir}/publication'/> <delete path='${outputPath}/dataset'/>
<delete path='${workingDir}/dataset'/> <delete path='${outputPath}/software'/>
<delete path='${workingDir}/otherresearchproduct'/> <delete path='${outputPath}/publication'/>
<delete path='${workingDir}/software'/> <delete path='${outputPath}/otherresearchproduct'/>
<delete path='${outputPath}/project'/>
<delete path='${outputPath}/organization'/>
<delete path='${outputPath}/datasource'/>
</fs> </fs>
<ok to="copy_relation"/> <ok to="copy_entities"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${workingDir}/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
<ok to="fork_exec_bulktag"/> <ok to="copy_wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="fork_exec_bulktag"/>
<fork name="fork_exec_bulktag"> <fork name="fork_exec_bulktag">
<path start="join_bulktag_publication"/> <path start="join_bulktag_publication"/>
<path start="join_bulktag_dataset"/> <path start="join_bulktag_dataset"/>
@ -81,11 +119,11 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<!-- <arg>&#45;&#45;hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>-->
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg> <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--protoMap</arg><arg>${protoMap}</arg> <arg>--protoMap</arg><arg>${protoMap}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--saveGraph</arg><archive>${saveGraph}</archive>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -110,11 +148,11 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<!-- <arg>&#45;&#45;hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>-->
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg> <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--protoMap</arg><arg>${protoMap}</arg> <arg>--protoMap</arg><arg>${protoMap}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--saveGraph</arg><archive>${saveGraph}</archive>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -139,11 +177,11 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<!-- <arg>&#45;&#45;hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>-->
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg> <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--protoMap</arg><arg>${protoMap}</arg> <arg>--protoMap</arg><arg>${protoMap}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--saveGraph</arg><archive>${saveGraph}</archive>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -168,11 +206,11 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<!-- <arg>&#45;&#45;hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>-->
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg> <arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--protoMap</arg><arg>${protoMap}</arg> <arg>--protoMap</arg><arg>${protoMap}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--saveGraph</arg><archive>${saveGraph}</archive>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -13,24 +13,8 @@
<description>the allowed types</description> <description>the allowed types</description>
</property> </property>
<property> <property>
<name>sparkDriverMemory</name> <name>outputPath</name>
<description>memory for driver process</description> <description>the output path</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>sparkExecutorNumber</name>
<description>number of executors used</description>
</property>
<property>
<name>saveGraph</name>
<description>writes new version of the graph after the propagation step</description>
</property> </property>
</parameters> </parameters>
@ -43,26 +27,70 @@
<action name="reset-outputpath"> <action name="reset-outputpath">
<fs> <fs>
<delete path='${workingDir}/preparedInfo'/> <delete path='${workingDir}/preparedInfo'/>
<delete path='${workingDir}/relation'/> <delete path='${outputPath}/relation'/>
<delete path='${workingDir}/dataset'/> <delete path='${outputPath}/dataset'/>
<delete path='${workingDir}/software'/> <delete path='${outputPath}/software'/>
<delete path='${workingDir}/publication'/> <delete path='${outputPath}/publication'/>
<delete path='${workingDir}/otherresearchproduct'/> <delete path='${outputPath}/otherresearchproduct'/>
<delete path='${outputPath}/project'/>
<delete path='${outputPath}/organization'/>
<delete path='${outputPath}/datasource'/>
</fs> </fs>
<ok to="copy_relation"/> <ok to="copy_entities"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${workingDir}/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
<ok to="prepare_datasource_country_association"/> <ok to="copy_wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="prepare_datasource_country_association"/>
<action name="prepare_datasource_country_association"> <action name="prepare_datasource_country_association">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -117,7 +145,7 @@
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg> <arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg> <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
@ -146,7 +174,7 @@
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg> <arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg> <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
@ -175,7 +203,7 @@
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg> <arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg> <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
@ -204,7 +232,7 @@
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg> <arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg> <arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>

View File

@ -9,12 +9,8 @@
<description>the semantic relationships allowed for propagation</description> <description>the semantic relationships allowed for propagation</description>
</property> </property>
<property> <property>
<name>writeUpdate</name> <name>outputPath</name>
<description>writes the information found for the update. No double check done if the information is already present</description> <description>the output path</description>
</property>
<property>
<name>saveGraph</name>
<description>writes new version of the graph after the propagation step</description>
</property> </property>
</parameters> </parameters>
<start to="reset-outputpath"/> <start to="reset-outputpath"/>
@ -27,24 +23,72 @@
</kill> </kill>
<action name="reset-outputpath"> <action name="reset-outputpath">
<fs> <fs>
<delete path='${workingDir}/orcid_propagation'/>
<delete path='${workingDir}/preparedInfo'/> <delete path='${workingDir}/preparedInfo'/>
<delete path='${outputPath}/relation'/>
<delete path='${outputPath}/dataset'/>
<delete path='${outputPath}/software'/>
<delete path='${outputPath}/publication'/>
<delete path='${outputPath}/otherresearchproduct'/>
<delete path='${outputPath}/project'/>
<delete path='${outputPath}/organization'/>
<delete path='${outputPath}/datasource'/>
</fs> </fs>
<ok to="copy_relation"/> <ok to="copy_entities"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${workingDir}/orcid_propagation/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
<ok to="fork_prepare_assoc_step1"/> <ok to="copy_wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="fork_prepare_assoc_step1"/>
<fork name="fork_prepare_assoc_step1"> <fork name="fork_prepare_assoc_step1">
<path start="join_prepare_publication"/> <path start="join_prepare_publication"/>
@ -222,7 +266,8 @@
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/orcid_propagation/publication</arg> <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -249,7 +294,8 @@
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/orcid_propagation/dataset</arg> <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -276,7 +322,8 @@
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/orcid_propagation/otherresearchproduct</arg> <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -303,7 +350,8 @@
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/orcid_propagation/software</arg> <arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -9,20 +9,8 @@
<description>the allowed semantics </description> <description>the allowed semantics </description>
</property> </property>
<property> <property>
<name>sparkDriverMemory</name> <name>outputPath</name>
<description>memory for driver process</description> <description>the output path</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>saveGraph</name>
<description>writes new version of the graph after the propagation step</description>
</property> </property>
</parameters> </parameters>
@ -35,21 +23,27 @@
<action name="reset-outputpath"> <action name="reset-outputpath">
<fs> <fs>
<delete path='${workingDir}/preparedInfo'/> <delete path='${workingDir}/preparedInfo'/>
<delete path='${workingDir}/relation'/> <delete path='${outputPath}/relation'/>
<delete path='${workingDir}/dataset'/> <delete path='${outputPath}/dataset'/>
<delete path='${workingDir}/software'/> <delete path='${outputPath}/software'/>
<delete path='${workingDir}/publication'/> <delete path='${outputPath}/publication'/>
<delete path='${workingDir}/otherresearchproduct'/> <delete path='${outputPath}/otherresearchproduct'/>
<delete path='${outputPath}/project'/>
<delete path='${outputPath}/organization'/>
<delete path='${outputPath}/datasource'/>
</fs> </fs>
<ok to="copy_relations"/> <ok to="copy_entities"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name="copy_relations"> <fork name="copy_entities">
<path start="copy_relation"/> <path start="copy_relation"/>
<path start="copy_publication"/> <path start="copy_publication"/>
<path start="copy_dataset"/> <path start="copy_dataset"/>
<path start="copy_orp"/> <path start="copy_orp"/>
<path start="copy_software"/> <path start="copy_software"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork> </fork>
<action name="copy_relation"> <action name="copy_relation">
@ -57,7 +51,7 @@
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${workingDir}/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -68,7 +62,7 @@
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/publication</arg> <arg>${nameNode}/${sourcePath}/publication</arg>
<arg>${nameNode}/${workingDir}/publication</arg> <arg>${nameNode}/${outputPath}/publication</arg>
</distcp> </distcp>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -79,7 +73,7 @@
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/dataset</arg> <arg>${nameNode}/${sourcePath}/dataset</arg>
<arg>${nameNode}/${workingDir}/dataset</arg> <arg>${nameNode}/${outputPath}/dataset</arg>
</distcp> </distcp>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -90,7 +84,7 @@
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg> <arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
<arg>${nameNode}/${workingDir}/otherresearchproduct</arg> <arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
</distcp> </distcp>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -101,11 +95,42 @@
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/software</arg> <arg>${nameNode}/${sourcePath}/software</arg>
<arg>${nameNode}/${workingDir}/software</arg> <arg>${nameNode}/${outputPath}/software</arg>
</distcp> </distcp>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy"/>
<error to="Kill"/>
</action>
<join name="wait" to="prepare_project_results_association"/> <join name="wait" to="prepare_project_results_association"/>
@ -156,7 +181,7 @@
</spark-opts> </spark-opts>
<arg>--saveGraph</arg><arg>${saveGraph}</arg> <arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg> <arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
</spark> </spark>

View File

@ -23,12 +23,6 @@
"paramDescription": "true if the spark session is managed, false otherwise", "paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false "paramRequired": false
}, },
{
"paramName":"test",
"paramLongName":"isTest",
"paramDescription": "true if it is executing a test",
"paramRequired": false
},
{ {
"paramName": "out", "paramName": "out",
"paramLongName": "outputPath", "paramLongName": "outputPath",

View File

@ -9,12 +9,8 @@
<description>organization community map</description> <description>organization community map</description>
</property> </property>
<property> <property>
<name>writeUpdate</name> <name>outputPath</name>
<description>writes the information found for the update. No double check done if the information is already present</description> <description>the output path</description>
</property>
<property>
<name>saveGraph</name>
<description>writes new version of the graph after the propagation step</description>
</property> </property>
</parameters> </parameters>
@ -26,23 +22,71 @@
<action name="reset-outputpath"> <action name="reset-outputpath">
<fs> <fs>
<delete path='${workingDir}/resulttocommunityorganization_propagation'/> <delete path='${outputPath}/relation'/>
<delete path='${outputPath}/datasource'/>
<delete path='${outputPath}/organization'/>
<delete path='${outputPath}/project'/>
<delete path='${outputPath}/publication'/>
<delete path='${outputPath}/dataset'/>
<delete path='${outputPath}/software'/>
<delete path='${outputPath}/otherresearchproduct'/>
<delete path='${workingDir}/preparedInfo'/>
</fs> </fs>
<ok to="copy_relation"/> <ok to="copy_entities"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${workingDir}/projecttoresult_propagation/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
<ok to="prepare_result_communitylist"/> <ok to="copy_wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="prepare_result_communitylist"/>
<action name="prepare_result_communitylist"> <action name="prepare_result_communitylist">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -100,7 +144,8 @@
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg> <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -127,7 +172,8 @@
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg> <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -154,7 +200,8 @@
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg> <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -181,7 +228,8 @@
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg> <arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -275,25 +275,6 @@
<arg>--outputPath</arg><arg>${outputPath}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<jar>dhp-propagation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>