<?xml version="1.0" encoding="UTF-8"?>
<workflow-app name= "BioSchemaHarvester" xmlns= "uri:oozie:workflow:0.5" >
<!--
    Formal workflow parameters. Each value is the default for the property
    of the same name; the properties are referenced further down in this
    workflow by the ResetWorkingPath and bmuseScraping actions.
-->
<parameters>
    <property>
        <name>workingPath</name>
        <value>/data/bioschema/ped/</value>
        <description>the working path</description>
    </property>
    <!-- Sitemap location handed to the scraping job as its sitemapUrl argument. -->
    <property>
        <name>sitemapUrl</name>
        <value>https://proteinensemble.org/sitemap2.xml.gz</value>
    </property>
    <!-- Handed to the scraping job as its sitemapURLKey argument.
         NOTE(review): presumably the sitemap element that holds each page URL; confirm against the job. -->
    <property>
        <name>sitemapURLKey</name>
        <value>loc</value>
    </property>
    <property>
        <name>dynamic</name>
        <value>true</value>
        <description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
    </property>
    <!-- NOTE(review): the description mentions "no limit" while the default value below is 10;
         confirm which of the two is intended. -->
    <property>
        <name>maxScrapedPages</name>
        <value>10</value>
        <description>max number of pages that will be scraped, default: no limit</description>
    </property>
    <property>
        <name>rdfOutput</name>
        <value>nquads.seq</value>
        <description>rdf output of scraping step</description>
    </property>
    <property>
        <name>scraping_java_opts</name>
        <value>-Xmx4g -Dwebdriver.chrome.whitelistedIps=</value>
        <description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
    </property>
</parameters>
<!-- Workflow-wide job tracker and name node, taken from the jobTracker and nameNode properties. -->
<global>
    <job-tracker> ${jobTracker}</job-tracker>
    <name-node> ${nameNode}</name-node>
</global>
<!-- Entry point: execution begins at the ResetWorkingPath action. -->
<start to="ResetWorkingPath"/>

<!-- Failure node targeted by the error transitions of the actions; its message reports the error of the last failed node. -->
<kill name="Kill">
    <message> Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!-- Deletes the RDF output of a previous run, then hands over to bmuseScraping. -->
<action name="ResetWorkingPath">
    <fs>
        <!-- The path is workingPath immediately followed by rdfOutput, with no separator in between,
             so workingPath is expected to end with a slash. -->
        <delete path="${workingPath}${rdfOutput}"/>
    </fs>
    <ok to="bmuseScraping"/>
    <error to="Kill"/>
</action>
<!-- Runs eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob as a java action; goes to End on success and to Kill on failure. -->
<action name="bmuseScraping">
    <java>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>oozie.launcher.mapreduce.user.classpath.first</name>
                <value>true</value>
            </property>
        </configuration>
        <main-class>eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob</main-class>
        <!-- JVM options come from the scraping_java_opts workflow parameter. -->
        <java-opts>${scraping_java_opts}</java-opts>
        <!-- Each option name is followed by its value as a separate arg element. -->
        <arg>--nameNode</arg><arg>${nameNode}</arg>
        <arg>--workingPath</arg><arg>${workingPath}</arg>
        <arg>--rdfOutput</arg><arg>${rdfOutput}</arg>
        <arg>--sitemapUrl</arg><arg>${sitemapUrl}</arg>
        <arg>--sitemapURLKey</arg><arg>${sitemapURLKey}</arg>
        <arg>--dynamic</arg><arg>${dynamic}</arg>
        <!-- The maxScrapedPages workflow parameter was declared but never handed to the job,
             so the configured page limit could not take effect.
             NOTE(review): assumes ScrapingJob accepts this option; confirm against the job's argument list. -->
        <arg>--maxScrapedPages</arg><arg>${maxScrapedPages}</arg>
    </java>
    <ok to="End"/>
    <error to="Kill"/>
</action>
<!-- Success node, reached through the ok transition of bmuseScraping. -->
<end name="End"/>
</workflow-app>