From 0029b3de330e658da9f408cb2e854414ff3bec5a Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 9 Dec 2021 22:36:05 +0100 Subject: [PATCH] removed unused components --- .../dhp/bmuse/bioschema/ScrapingJob.java | 2 +- .../dhp/bmuse/bioschema/SparkScraper.java | 114 ------------------ .../utils/FunctionalInterfaceSupport.java | 65 ---------- .../dhp/bmuse/utils/SparkSessionSupport.java | 75 ------------ 4 files changed, 1 insertion(+), 255 deletions(-) delete mode 100644 dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/SparkScraper.java delete mode 100644 dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/FunctionalInterfaceSupport.java delete mode 100644 dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/SparkSessionSupport.java diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java index 9dacebc0b..0a1f897f0 100644 --- a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java +++ b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java @@ -82,7 +82,7 @@ public class ScrapingJob { String nquads; try { String site = u.text(); - logger.info(site + " > parsing"); + logger.debug(site + " > parsing"); nquads = scraper.scrapeUrl(site, scrapingType); final Text value = new Text(nquads); writer.append(key, value); diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/SparkScraper.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/SparkScraper.java deleted file mode 100644 index 651757d21..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/SparkScraper.java +++ /dev/null @@ -1,114 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.bioschema; - -import static eu.dnetlib.dhp.bmuse.utils.SparkSessionSupport.runWithSparkSession; - -import java.util.Objects; -import java.util.Optional; -import java.util.function.Function; -import java.util.stream.Stream; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.util.LongAccumulator; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.bmuse.utils.ArgumentApplicationParser; -import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper; -import eu.dnetlib.dhp.bmuse.utils.FunctionalInterfaceSupport; -import eu.dnetlib.dhp.bmuse.utils.UrlParser; - -public class SparkScraper { - - static Logger logger = LoggerFactory.getLogger(SparkScraper.class); - - public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkScraper.class - .getResourceAsStream( - "/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json"))); - parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - - final String nameNode = parser.get("nameNode"); - final String workingPath = parser.get("workingPath"); - final String rdfOutput = parser.get("rdfOutput"); - final String sitemapUrl = parser.get("sitemapUrl"); - final String sitemapURLKey = parser.get("sitemapURLKey"); - final String dynamic = parser.get("dynamic"); - final String maxScrapedPages = parser.get("maxScrapedPages"); - Boolean dynamicValue = true; - if (Objects.nonNull(dynamic)) { - dynamicValue = Boolean.parseBoolean(dynamic); - } - final boolean scrapingType = dynamicValue.booleanValue(); - - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - final LongAccumulator scraped = spark.sparkContext().longAccumulator("scraped"); - final LongAccumulator errors = spark.sparkContext().longAccumulator("errors"); - - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - System.setProperty("webdriver.chrome.whitelistedIps", ""); - - BMUSEScraper scraper = new BMUSEScraper(); - String url = sitemapUrl.toLowerCase(); - Elements urls = UrlParser.getSitemapList(url, sitemapURLKey); - long total = urls.size(); - - Path output = new Path( - nameNode - .concat(workingPath) - .concat(rdfOutput)); - try (SequenceFile.Writer writer = SequenceFile - .createWriter( - sc.hadoopConfiguration(), - SequenceFile.Writer.file(output), - SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class), - SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) { - Stream urlStream = null; - if (Objects.nonNull(maxScrapedPages)) { - urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages)); - } else { - urlStream = urls.stream(); - } - urlStream.forEach(u -> { - try { - final Text key = new Text(u.text()); - final Text value = new Text(scraper.scrapeUrl(u.text(), scrapingType)); - writer.append(key, value); - scraped.add(1l); - } catch (Exception e) { - logger.error(u.text(), e); - errors.add(1l); - } - }); - } - - logger - .info( - "Total pages to scrape: " + total + " Scraped: " + scraped.value() + - " Errors: " + errors.value()); - }); - } -} diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/FunctionalInterfaceSupport.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/FunctionalInterfaceSupport.java deleted file mode 100644 index f6e77d6a1..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/FunctionalInterfaceSupport.java +++ /dev/null @@ -1,65 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.utils; - -import java.io.Serializable; -import java.util.function.Consumer; -import java.util.function.Supplier; - -/** Provides serializable and throwing extensions to standard functional interfaces. */ -public class FunctionalInterfaceSupport { - - private FunctionalInterfaceSupport() { - } - - /** - * Serializable consumer of any kind of objects. To be used withing spark processing pipelines when supplying - * functions externally. - * - * @param - */ - @FunctionalInterface - public interface SerializableConsumer extends Consumer, Serializable { - } - - /** - * Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying - * functions externally. - * - * @param - */ - @FunctionalInterface - public interface SerializableSupplier extends Supplier, Serializable { - } - - /** - * Extension of consumer accepting functions throwing an exception. - * - * @param - * @param - */ - @FunctionalInterface - public interface ThrowingConsumer { - void accept(T t) throws E; - } - - /** - * Extension of supplier accepting functions throwing an exception. - * - * @param - * @param - */ - @FunctionalInterface - public interface ThrowingSupplier { - T get() throws E; - } - - /** - * Extension of runnable accepting functions throwing an exception. - * - * @param - */ - @FunctionalInterface - public interface ThrowingRunnable { - void run() throws E; - } -} diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/SparkSessionSupport.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/SparkSessionSupport.java deleted file mode 100644 index 8e7308c44..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/SparkSessionSupport.java +++ /dev/null @@ -1,75 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.utils; - -import java.util.Objects; -import java.util.function.Function; - -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SparkSession; - -/** SparkSession utility methods. */ -public class SparkSessionSupport { - - private SparkSessionSupport() { - } - - /** - * Runs a given function using SparkSession created using default builder and supplied SparkConf. Stops SparkSession - * when SparkSession is managed. Allows to reuse SparkSession created externally. - * - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkSession( - SparkConf conf, Boolean isSparkSessionManaged, - FunctionalInterfaceSupport.ThrowingConsumer fn) { - runWithSparkSession( - c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn); - } - - /** - * Runs a given function using SparkSession created with hive support and using default builder and supplied - * SparkConf. Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally. - * - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkHiveSession( - SparkConf conf, Boolean isSparkSessionManaged, - FunctionalInterfaceSupport.ThrowingConsumer fn) { - runWithSparkSession( - c -> SparkSession.builder().config(c).enableHiveSupport().getOrCreate(), - conf, - isSparkSessionManaged, - fn); - } - - /** - * Runs a given function using SparkSession created using supplied builder and supplied SparkConf. Stops - * SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally. - * - * @param sparkSessionBuilder Builder of SparkSession - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkSession( - Function sparkSessionBuilder, - SparkConf conf, - Boolean isSparkSessionManaged, - FunctionalInterfaceSupport.ThrowingConsumer fn) { - SparkSession spark = null; - try { - spark = sparkSessionBuilder.apply(conf); - fn.accept(spark); - } catch (Exception e) { - throw new RuntimeException(e); - } finally { - if (Objects.nonNull(spark) && isSparkSessionManaged) { - spark.stop(); - } - } - } -}