forked from D-Net/dnet-hadoop
updated the transformation Baseline workflow to include mdstore rollback/commit action
This commit is contained in:
parent
01ce0b9c76
commit
9d342a47da
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Download_Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Download_Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>baselineWorkingPath</name>
|
||||
|
@ -9,8 +9,12 @@
|
|||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>The target path</description>
|
||||
<name>mdStoreOutputId</name>
|
||||
<description>the identifier of the cleaned MDStore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>skipUpdate</name>
|
||||
|
@ -19,12 +23,31 @@
|
|||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ConvertDataset"/>
|
||||
<start to="StartTransaction"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="ConvertDataset"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="ConvertDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -43,16 +66,52 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${baselineWorkingPath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
|
||||
<arg>--skipUpdate</arg><arg>${skipUpdate}</arg>
|
||||
</spark>
|
||||
<ok to="CommitVersion"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="CommitVersion">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="Kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -2,9 +2,12 @@ package eu.dnetlib.dhp.sx.bio.ebi
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed._
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
|
@ -164,11 +167,15 @@ object SparkCreateBaselineDataFrame {
|
|||
val workingPath = parser.get("workingPath")
|
||||
log.info("workingPath: {}", workingPath)
|
||||
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info("targetPath: {}", targetPath)
|
||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
||||
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
|
||||
|
||||
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
||||
log.info("outputBasePath: {}", outputBasePath)
|
||||
|
||||
val hdfsServerUri = parser.get("hdfsServerUri")
|
||||
log.info("hdfsServerUri: {}", targetPath)
|
||||
log.info("hdfsServerUri: {}", hdfsServerUri)
|
||||
|
||||
val skipUpdate = parser.get("skipUpdate")
|
||||
log.info("skipUpdate: {}", skipUpdate)
|
||||
|
@ -216,8 +223,11 @@ object SparkCreateBaselineDataFrame {
|
|||
.map(a => PubMedToOaf.convert(a, vocabularies))
|
||||
.as[Oaf]
|
||||
.filter(p => p != null),
|
||||
targetPath
|
||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
||||
)
|
||||
|
||||
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
|
||||
val mdStoreSize = df.count
|
||||
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue