<!-- 2021-12-01 11:10:08 +01:00 -->
<workflow-app name= "BioSchemaHarvester" xmlns= "uri:oozie:workflow:0.5" >
<!--
    Workflow parameters.
    A property with a <value> declares a default. A property without one declares
    no default here, so its value is expected to come from the job configuration.
-->
<parameters>
    <!-- ResetWorkingPath concatenates this directly with rdfOutput, so it must end with '/'. -->
    <property>
        <name>workingPath</name>
        <value>/data/bioschema/mobidb/</value>
        <description>the working path</description>
    </property>
    <property>
        <name>sitemapUrl</name>
        <value>https://mobidb.org/sitemap2.xml.gz</value>
    </property>
    <property>
        <name>sitemapURLKey</name>
        <value>loc</value>
    </property>
    <property>
        <name>dynamic</name>
        <value>true</value>
        <description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
    </property>
    <!--
        NOTE(review): no action in this workflow references ${maxScrapedPages}.
        The description mentions "no limit" while the default declared here is 100; confirm which is intended.
    -->
    <property>
        <name>maxScrapedPages</name>
        <value>100</value>
        <description>max number of pages that will be scraped, default: no limit</description>
    </property>
    <!-- File name appended to workingPath to form the output path deleted by ResetWorkingPath. -->
    <property>
        <name>rdfOutput</name>
        <value>nquads.seq</value>
        <description>rdf output of scraping step</description>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.map.java.opts</name>
        <value>-Xmx4g</value>
    </property>

    <!-- Values below are substituted into the spark-opts of the bmuseScrapingSpark action. -->
    <property>
        <name>spark2MaxExecutors</name>
        <value>1</value>
    </property>
    <property>
        <name>sparkDriverMemory</name>
        <value>7G</value>
        <description>memory for driver process</description>
    </property>
    <property>
        <name>sparkExecutorMemory</name>
        <value>4G</value>
        <description>memory for individual executor</description>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
        <description>spark 2.* extra listeners classname</description>
    </property>
    <!-- No default declared for the two properties below. -->
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <description>spark 2.* yarn history server address</description>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <description>spark 2.* event log dir location</description>
    </property>
</parameters>
<!--
    Settings shared by the workflow's actions. ${jobTracker} and ${nameNode} are not
    declared in <parameters>, so they are expected from the job configuration.
-->
<global>
    <job-tracker>${jobTracker}</job-tracker>
    <name-node>${nameNode}</name-node>
</global>
<!-- Entry point: execution begins at the ResetWorkingPath action. -->
<start to="ResetWorkingPath"/>
<!-- Failure node targeted by every action's <error> transition; reports the error message of the last failed node. -->
<kill name="Kill">
    <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!--
    Deletes the path ${workingPath}${rdfOutput} and then hands over to bmuseScrapingSpark.
    The two parameters are concatenated without a separator, so workingPath must end with '/'.
-->
<action name="ResetWorkingPath">
    <fs>
        <delete path="${workingPath}${rdfOutput}"/>
    </fs>
    <ok to="bmuseScrapingSpark"/>
    <error to="Kill"/>
</action>
<!--
    Java action that runs eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob with the scraping parameters.
    NOTE(review): no transition in this workflow targets this action (ResetWorkingPath goes to
    bmuseScrapingSpark), so it appears to be unused; confirm before removing it.
-->
<action name="bmuseScraping">
    <java>
        <name-node>${nameNode}</name-node>
        <main-class>eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob</main-class>
        <!-- Arguments are passed as option/value pairs. -->
        <arg>--nameNode</arg> <arg>${nameNode}</arg>
        <arg>--workingPath</arg> <arg>${workingPath}</arg>
        <arg>--rdfOutput</arg> <arg>${rdfOutput}</arg>
        <arg>--sitemapUrl</arg> <arg>${sitemapUrl}</arg>
        <arg>--sitemapURLKey</arg> <arg>${sitemapURLKey}</arg>
        <arg>--dynamic</arg> <arg>${dynamic}</arg>
    </java>
    <ok to="End"/>
    <error to="Kill"/>
</action>
<!-- 2021-12-03 15:44:39 +01:00 -->
<!--
    Spark action that runs eu.dnetlib.dhp.bmuse.bioschema.SparkScraper from the dhp-bmuse jar.
    Reached from ResetWorkingPath; goes to End on success and to Kill on error.
-->
<action name="bmuseScrapingSpark">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <!-- The deploy mode is given by <mode>; the combined "yarn-cluster" master is deprecated since Spark 2.0. -->
        <master>yarn</master>
        <mode>cluster</mode>
        <name>bmuseScrapingSpark</name>
        <class>eu.dnetlib.dhp.bmuse.bioschema.SparkScraper</class>
        <jar>dhp-bmuse-${projectVersion}.jar</jar>
        <!-- Executor cap, memory sizes, listeners and history/event-log settings come from the workflow parameters. -->
        <spark-opts>
            --conf spark.dynamicAllocation.enabled=true
            --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
            --executor-memory=${sparkExecutorMemory}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <!-- Arguments are passed as option/value pairs. -->
        <arg>--nameNode</arg> <arg>${nameNode}</arg>
        <arg>--workingPath</arg> <arg>${workingPath}</arg>
        <arg>--rdfOutput</arg> <arg>${rdfOutput}</arg>
        <arg>--sitemapUrl</arg> <arg>${sitemapUrl}</arg>
        <arg>--sitemapURLKey</arg> <arg>${sitemapURLKey}</arg>
        <arg>--dynamic</arg> <arg>${dynamic}</arg>
    </spark>
    <ok to="End"/>
    <error to="Kill"/>
</action>
<!-- 2021-12-01 11:10:08 +01:00 -->
<!-- Successful completion node, targeted by the <ok> transition of both scraping actions. -->
<end name="End"/>
</workflow-app>