updated workflows for the generation of Scholix datasources to use mdstore transactions

Sandro La Bruzzo 2023-12-18 16:05:35 +01:00
parent 9d39845d1f
commit 37e36baf76
6 changed files with 185 additions and 40 deletions

File 1 of 6: Oozie workflow "Transform_BioEntity_Workflow"

@@ -1,4 +1,4 @@
 <workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>sourcePath</name>
@@ -8,19 +8,40 @@
             <name>database</name>
             <description>the PDB Database Working Path</description>
         </property>
         <property>
-            <name>targetPath</name>
-            <description>the Target Working dir path</description>
+            <name>mdStoreOutputId</name>
+            <description>the identifier of the cleaned MDStore</description>
+        </property>
+        <property>
+            <name>mdStoreManagerURI</name>
+            <description>the path of the cleaned mdstore</description>
         </property>
     </parameters>
-    <start to="ConvertDB"/>
+    <start to="StartTransaction"/>
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
+    <action name="StartTransaction">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>NEW_VERSION</arg>
+            <arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <capture-output/>
+        </java>
+        <ok to="ConvertDB"/>
+        <error to="RollBack"/>
+    </action>
     <action name="ConvertDB">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>

@@ -41,11 +62,48 @@
             <arg>--master</arg><arg>yarn</arg>
             <arg>--dbPath</arg><arg>${sourcePath}</arg>
             <arg>--database</arg><arg>${database}</arg>
-            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
         </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
+        <ok to="CommitVersion"/>
+        <error to="RollBack"/>
     </action>
-    <end name="End"/>
+    <action name="CommitVersion">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>COMMIT</arg>
+            <arg>--namenode</arg><arg>${nameNode}</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    <action name="RollBack">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>ROLLBACK</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
+        <ok to="Kill"/>
+        <error to="Kill"/>
+    </action>
+    <end name="End"/>
 </workflow-app>

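Both workflows in this commit use the same handshake: a StartTransaction java action asks the MDStore manager for a NEW_VERSION and publishes the resulting MDStoreVersion JSON through Oozie's <capture-output/>; downstream actions pick it up as ${wf:actionData('StartTransaction')['mdStoreVersion']}, and the final CommitVersion or RollBack action closes the transaction. A minimal sketch of the publishing step, assuming only standard Oozie launcher behaviour (this is not the actual MDStoreActionNode code):

import java.io.{File, FileOutputStream}
import java.util.Properties

object CaptureOutputSketch {
  // An Oozie <java> action declaring <capture-output/> exposes values to later
  // actions by writing a properties file to the path Oozie injects through
  // this system property; "mdStoreVersion" is the key the workflows read back.
  def publishMdStoreVersion(mdStoreVersionJson: String): Unit = {
    val props = new Properties()
    props.setProperty("mdStoreVersion", mdStoreVersionJson)
    val out = new FileOutputStream(new File(System.getProperty("oozie.action.output.properties")))
    try props.store(out, "capture-output") finally out.close()
  }
}
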
File 2 of 6: argument definitions (JSON) for the bio database transform job

@@ -2,5 +2,5 @@
 {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
 {"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true},
 {"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true},
-{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the OAF target path ", "paramRequired": true}
+{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}
 ]

File 3 of 6: argument definitions (JSON) for the EBI links job

@@ -1,5 +1,20 @@
 [
-{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
-{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
-{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
+  {
+    "paramName": "mt",
+    "paramLongName": "master",
+    "paramDescription": "should be local or yarn",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the source Path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mo",
+    "paramLongName": "mdstoreOutputVersion",
+    "paramDescription": "the oaf path ",
+    "paramRequired": true
+  }
 ]

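The rename matters because ArgumentApplicationParser resolves values by paramLongName, so the jobs now read --mdstoreOutputVersion where they used to read --targetPath. A sketch of the consuming side, following the usual dnet-hadoop pattern (the resource path here is hypothetical):

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils

object ParamsSketch {
  // Hypothetical resource name; the real JSON ships next to the job class.
  def mdstoreOutputVersion(args: Array[String]): String = {
    val parser = new ArgumentApplicationParser(
      IOUtils.toString(getClass.getResourceAsStream("/ebi_to_df_params.json"))
    )
    parser.parseArgument(args)
    // Lookup is by paramLongName, not by the short paramName.
    parser.get("mdstoreOutputVersion")
  }
}
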
File 4 of 6: Oozie workflow for the EBI links download and transform

@@ -9,34 +9,26 @@
             <description>the Working Path</description>
         </property>
         <property>
-            <name>targetPath</name>
-            <description>the OAF MDStore Path</description>
+            <name>mdStoreOutputId</name>
+            <description>the identifier of the cleaned MDStore</description>
         </property>
         <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
+            <name>mdStoreManagerURI</name>
+            <description>the path of the cleaned mdstore</description>
         </property>
         <property>
             <name>resumeFrom</name>
-            <value>DownloadEBILinks</value>
+            <value>CreateEBIDataSet</value>
             <description>node to start</description>
         </property>
     </parameters>
-    <start to="resume_from"/>
+    <start to="StartTransaction"/>
     <decision name="resume_from">
         <switch>
             <case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
-            <case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
+            <case to="StartTransaction">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
             <default to="DownloadEBILinks"/>
         </switch>
     </decision>

@@ -77,9 +69,29 @@
             <move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
             <move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
         </fs>
-        <ok to="CreateEBIDataSet"/>
+        <ok to="StartTransaction"/>
         <error to="Kill"/>
     </action>
+    <action name="StartTransaction">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>NEW_VERSION</arg>
+            <arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <capture-output/>
+        </java>
+        <ok to="CreateEBIDataSet"/>
+        <error to="RollBack"/>
+    </action>
     <action name="CreateEBIDataSet">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn-cluster</master>

@@ -95,11 +107,49 @@
                 ${sparkExtraOPT}
             </spark-opts>
             <arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
-            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
             <arg>--master</arg><arg>yarn</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>
     </action>
+    <action name="CommitVersion">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>COMMIT</arg>
+            <arg>--namenode</arg><arg>${nameNode}</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    <action name="RollBack">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>ROLLBACK</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
+        <ok to="Kill"/>
+        <error to="Kill"/>
+    </action>
     <end name="End"/>
 </workflow-app>

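Both Spark jobs below consume the version produced by StartTransaction in the same way: deserialize the JSON into an MDStoreVersion bean and derive the HDFS output path from it. Condensed into one helper (names taken from the code in this commit), the shared step looks roughly like this:

import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
import eu.dnetlib.dhp.utils.DHPUtils.MAPPER

object OutputPathSketch {
  // The --mdstoreOutputVersion argument carries the MDStoreVersion serialized
  // as JSON; its hdfsPath points at the transaction's working directory, and
  // the records themselves go under the MDSTORE_DATA_PATH suffix.
  def resolveDataPath(mdstoreOutputVersion: String): String = {
    val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
    s"${cleanedMdStoreVersion.getHdfsPath}/$MDSTORE_DATA_PATH"
  }
}
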
File 5 of 6: Scala source, SparkTransformBioDatabaseToOAF

@@ -2,13 +2,15 @@ package eu.dnetlib.dhp.sx.bio

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.collection.CollectionUtils
+import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH,MDSTORE_SIZE_PATH}
+import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
 import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
+import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}

 object SparkTransformBioDatabaseToOAF {

@@ -25,8 +27,13 @@ object SparkTransformBioDatabaseToOAF {
     val dbPath: String = parser.get("dbPath")
     log.info("dbPath: {}", database)
-    val targetPath: String = parser.get("targetPath")
-    log.info("targetPath: {}", database)
+    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+    log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
+
+    val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
+    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
+    log.info("outputBasePath: {}", outputBasePath)

     val spark: SparkSession =
       SparkSession

@@ -43,24 +50,28 @@ object SparkTransformBioDatabaseToOAF {
       case "UNIPROT" =>
         CollectionUtils.saveDataset(
           spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
-          targetPath
+          s"$outputBasePath/$MDSTORE_DATA_PATH"
         )
       case "PDB" =>
         CollectionUtils.saveDataset(
           spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
-          targetPath
+          s"$outputBasePath/$MDSTORE_DATA_PATH"
         )
       case "SCHOLIX" =>
         CollectionUtils.saveDataset(
           spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
-          targetPath
+          s"$outputBasePath/$MDSTORE_DATA_PATH"
         )
       case "CROSSREF_LINKS" =>
         CollectionUtils.saveDataset(
           spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
-          targetPath
+          s"$outputBasePath/$MDSTORE_DATA_PATH"
         )
     }
+    val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
+    val mdStoreSize = df.count
+    writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
   }
 }

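Both Scala jobs end with the same tail: re-read the data directory that was just written, count the records, and persist the total to MDSTORE_SIZE_PATH, presumably so the COMMIT action can register the store size with the MDStore manager. The same logic extracted into a helper, as a sketch:

import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
import org.apache.spark.sql.SparkSession

object MdStoreSizeSketch {
  // Counting what was actually written (rather than what was attempted) makes
  // the persisted size authoritative for the committed mdstore version.
  def writeMdStoreSize(spark: SparkSession, outputBasePath: String): Unit = {
    val mdStoreSize = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH").count()
    writeHdfsFile(
      spark.sparkContext.hadoopConfiguration,
      s"$mdStoreSize",
      s"$outputBasePath/$MDSTORE_SIZE_PATH"
    )
  }
}
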
File 6 of 6: Scala source, SparkEBILinksToOaf

@@ -9,6 +9,9 @@ import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql._
 import org.slf4j.{Logger, LoggerFactory}
+import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
+import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
+import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}

 object SparkEBILinksToOaf {

@@ -32,8 +35,13 @@ object SparkEBILinksToOaf {
     import spark.implicits._
     val sourcePath = parser.get("sourcePath")
     log.info(s"sourcePath -> $sourcePath")
-    val targetPath = parser.get("targetPath")
-    log.info(s"targetPath -> $targetPath")
+    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+    log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
+
+    val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
+    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
+    log.info("outputBasePath: {}", outputBasePath)
+
     implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])

     val ebLinks: Dataset[EBILinkItem] = spark.read

@@ -46,7 +54,10 @@ object SparkEBILinksToOaf {
         .flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
         .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
         .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
-      targetPath
+      s"$outputBasePath/$MDSTORE_DATA_PATH"
     )
+    val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
+    val mdStoreSize = df.count
+    writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
   }
 }