modified workflow configuration; added BMUSE scraping as a Spark action

Enrico Ottonello 2021-12-03 15:44:39 +01:00
parent ff530fdcb4
commit 83d5e165a7
7 changed files with 169 additions and 40 deletions

View File

@@ -70,6 +70,8 @@ public class ScrapingJob {
Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);
long total = urls.size();
System.setProperty("webdriver.chrome.whitelistedIps", "");
Path output = new Path(
nameNode
.concat(workingPath)
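The webdriver.chrome.whitelistedIps system property is read by Selenium when it launches the chromedriver service, so it has to be in place before any driver is created, which is why the hunk above sets it ahead of the scraping work. A minimal, illustrative sketch of that ordering follows; it is not taken from this repository, and the ChromeDriver/ChromeOptions usage and the headless flags are assumptions about how BMUSE drives the browser.

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

public class ChromeWhitelistedIpsSketch {
    public static void main(String[] args) {
        // Must be set before the ChromeDriver is instantiated: Selenium reads it when it
        // starts the chromedriver service (it maps to chromedriver's --whitelisted-ips flag).
        System.setProperty("webdriver.chrome.whitelistedIps", "");

        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless", "--no-sandbox"); // illustrative flags for running in a container

        // Requires a chromedriver binary on the PATH (or webdriver.chrome.driver pointing at one).
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get("https://example.org/"); // placeholder page
        } finally {
            driver.quit();
        }
    }
}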

View File

@@ -0,0 +1,111 @@
package eu.dnetlib.dhp.bmuse.bioschema;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper;
import eu.dnetlib.dhp.bmuse.utils.UrlParser;
public class SparkScraper {
static Logger logger = LoggerFactory.getLogger(SparkScraper.class);
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkScraper.class
.getResourceAsStream(
"/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String nameNode = parser.get("nameNode");
final String workingPath = parser.get("workingPath");
final String rdfOutput = parser.get("rdfOutput");
final String sitemapUrl = parser.get("sitemapUrl");
final String sitemapURLKey = parser.get("sitemapURLKey");
final String dynamic = parser.get("dynamic");
final String maxScrapedPages = parser.get("maxScrapedPages");
Boolean dynamicValue = true;
if (Objects.nonNull(dynamic)) {
dynamicValue = Boolean.parseBoolean(dynamic);
}
final boolean scrapingType = dynamicValue.booleanValue();
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
final LongAccumulator scraped = spark.sparkContext().longAccumulator("scraped");
final LongAccumulator errors = spark.sparkContext().longAccumulator("errors");
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
System.setProperty("webdriver.chrome.whitelistedIps", "");
BMUSEScraper scraper = new BMUSEScraper();
String url = sitemapUrl.toLowerCase();
Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);
long total = urls.size();
Path output = new Path(
nameNode
.concat(workingPath)
.concat(rdfOutput));
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
sc.hadoopConfiguration(),
SequenceFile.Writer.file(output),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
Stream<Element> urlStream = null;
if (Objects.nonNull(maxScrapedPages)) {
urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
} else {
urlStream = urls.stream();
}
urlStream.forEach(u -> {
try {
final Text key = new Text(u.text());
final Text value = new Text(scraper.scrapeUrl(u.text(), scrapingType));
writer.append(key, value);
scraped.add(1l);
} catch (Exception e) {
logger.error(u.text(), e);
errors.add(1l);
}
});
}
logger
.info(
"Total pages to scrape: " + total + " Scraped: " + scraped.value() +
" Errors: " + errors.value());
});
}
}
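For orientation only (not part of this commit): a minimal sketch of how a later Spark step could read back the SequenceFile written by SparkScraper, where each record key is the scraped page URL and the value is the nquads string returned by BMUSEScraper.scrapeUrl. The class name, the local master, and the concrete HDFS path are assumptions.

import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class NquadsReaderSketch {
    public static void main(String[] args) {
        // Local master for illustration only; in the workflow the master comes from the Oozie spark action.
        SparkConf conf = new SparkConf().setAppName("readScrapedNquads").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Hypothetical path, mirroring nameNode + workingPath + rdfOutput from the workflow parameters.
            String input = "hdfs://nameservice1/data/bioschema/mobidb/nquads.seq";
            JavaPairRDD<Text, Text> pairs = sc.sequenceFile(input, Text.class, Text.class);
            // Copy the reused Writable instances to plain Strings before collecting.
            pairs
                .mapToPair(t -> new Tuple2<>(t._1().toString(), t._2().toString()))
                .take(10)
                .forEach(t -> System.out.println(t._1() + " -> " + t._2().length() + " chars of nquads"));
        }
    }
}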

View File

@@ -53,7 +53,7 @@ public class BMUSEScraper extends ScraperFilteredCore {
logger.error(e.toString());
return e.getMessage();
}
logger.info("HTML: " + html);
DocumentSource source = new StringDocumentSource(html, url);
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());

View File

@@ -2,22 +2,22 @@
<!-- OCEAN -->
<!-- <property>-->
<!-- <name>jobTracker</name>-->
<!-- <value>yarnRM</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>nameNode</name>-->
<!-- <value>hdfs://nameservice1</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>hive_metastore_uris</name>-->
<!-- <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>spark2YarnHistoryServerAddress</name>-->
<!-- <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>-->
<!-- </property>-->
<!-- GARR -->
@@ -38,13 +38,10 @@
<name>spark2YarnHistoryServerAddress</name>
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>

View File

@@ -2,17 +2,12 @@
<parameters>
<property>
<name>workingPath</name>
<value>/data/bioschema/disprot/</value>
<value>/data/bioschema/mobidb/</value>
<description>the working path</description>
</property>
<property>
<name>rdfOutput</name>
<value>nquads.seq</value>
<description>rdf output of scraping step</description>
</property>
<property>
<name>sitemapUrl</name>
<value>https://disprot.org/sitemap2.xml.gz</value>
<value>https://mobidb.org/sitemap2.xml.gz</value>
</property>
<property>
<name>sitemapURLKey</name>
@@ -28,13 +23,18 @@
<value>100</value>
<description>max number of pages that will be scraped, default: no limit</description>
</property>
<property>
<name>rdfOutput</name>
<value>nquads.seq</value>
<description>rdf output of scraping step</description>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx4g</value>
</property>
<property>
<name>spark2RdfConversionMaxExecutors</name>
<value>50</value>
<name>spark2MaxExecutors</name>
<value>1</value>
</property>
<property>
<name>sparkDriverMemory</name>
@@ -43,7 +43,7 @@
</property>
<property>
<name>sparkExecutorMemory</name>
<value>2G</value>
<value>4G</value>
<description>memory for individual executor</description>
</property>
<property>
@@ -75,7 +75,7 @@
<fs>
<delete path='${workingPath}${rdfOutput}'/>
</fs>
<ok to="bmuseScraping"/>
<ok to="bmuseScrapingSpark"/>
<error to="Kill"/>
</action>
@@ -94,5 +94,32 @@
<error to="Kill"/>
</action>
<action name="bmuseScrapingSpark">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>bmuseScrapingSpark</name>
<class>eu.dnetlib.dhp.bmuse.bioschema.SparkScraper</class>
<jar>dhp-bmuse-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--rdfOutput</arg><arg>${rdfOutput}</arg>
<arg>--sitemapUrl</arg><arg>${sitemapUrl}</arg>
<arg>--sitemapURLKey</arg><arg>${sitemapURLKey}</arg>
<arg>--dynamic</arg><arg>${dynamic}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
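For readers less familiar with Oozie spark actions: most of the spark-opts in the new bmuseScrapingSpark action correspond to the SparkConf settings sketched below. This is illustrative only, with placeholder values standing in for the ${...} workflow parameters; SparkScraper itself receives these settings from spark-submit rather than setting them in code.

import org.apache.spark.SparkConf;

public class SparkOptsSketch {
    public static void main(String[] args) {
        // Placeholder values stand in for the ${...} parameters that Oozie resolves at submission time.
        SparkConf conf = new SparkConf()
            .setAppName("bmuseScrapingSpark")
            .set("spark.dynamicAllocation.enabled", "true")
            .set("spark.dynamicAllocation.maxExecutors", "1")   // ${spark2MaxExecutors}
            .set("spark.executor.memory", "4G")                  // --executor-memory=${sparkExecutorMemory}
            .set("spark.driver.memory", "2g")                    // --driver-memory=${sparkDriverMemory} (value not shown in this diff)
            .set("spark.yarn.historyServer.address", "http://hadoop-rm2.garr-pa1.d4science.org:19888") // ${spark2YarnHistoryServerAddress}
            .set("spark.eventLog.dir", "hdfs://example-namenode/spark2-event-logs");                   // ${nameNode}${spark2EventLogDir} (placeholder)
        System.out.println(conf.toDebugString());
    }
}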

View File

@@ -24,7 +24,7 @@
<property>
<name>jobTracker</name>
<value>yarn</value>
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
</property>
<property>
<name>nameNode</name>
@@ -38,13 +38,10 @@
<name>spark2YarnHistoryServerAddress</name>
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>

View File

@@ -52,11 +52,6 @@
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
</global>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>