forked from D-Net/dnet-hadoop
removed unused components
This commit is contained in:
parent 4797cc460b
commit 0029b3de33
@@ -82,7 +82,7 @@ public class ScrapingJob {
 					String nquads;
 					try {
 						String site = u.text();
-						logger.info(site + " > parsing");
+						logger.debug(site + " > parsing");
 						nquads = scraper.scrapeUrl(site, scrapingType);
 						final Text value = new Text(nquads);
 						writer.append(key, value);
@@ -1,114 +0,0 @@
-
-package eu.dnetlib.dhp.bmuse.bioschema;
-
-import static eu.dnetlib.dhp.bmuse.utils.SparkSessionSupport.runWithSparkSession;
-
-import java.util.Objects;
-import java.util.Optional;
-import java.util.function.Function;
-import java.util.stream.Stream;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.compress.GzipCodec;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SparkSession;
-import org.apache.spark.util.LongAccumulator;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.dhp.bmuse.utils.ArgumentApplicationParser;
-import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper;
-import eu.dnetlib.dhp.bmuse.utils.FunctionalInterfaceSupport;
-import eu.dnetlib.dhp.bmuse.utils.UrlParser;
-
-public class SparkScraper {
-
-	static Logger logger = LoggerFactory.getLogger(SparkScraper.class);
-
-	public static void main(String[] args) throws Exception {
-
-		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					SparkScraper.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json")));
-		parser.parseArgument(args);
-		Boolean isSparkSessionManaged = Optional
-			.ofNullable(parser.get("isSparkSessionManaged"))
-			.map(Boolean::valueOf)
-			.orElse(Boolean.TRUE);
-
-		final String nameNode = parser.get("nameNode");
-		final String workingPath = parser.get("workingPath");
-		final String rdfOutput = parser.get("rdfOutput");
-		final String sitemapUrl = parser.get("sitemapUrl");
-		final String sitemapURLKey = parser.get("sitemapURLKey");
-		final String dynamic = parser.get("dynamic");
-		final String maxScrapedPages = parser.get("maxScrapedPages");
-		Boolean dynamicValue = true;
-		if (Objects.nonNull(dynamic)) {
-			dynamicValue = Boolean.parseBoolean(dynamic);
-		}
-		final boolean scrapingType = dynamicValue.booleanValue();
-
-		SparkConf conf = new SparkConf();
-		runWithSparkSession(
-			conf,
-			isSparkSessionManaged,
-			spark -> {
-				final LongAccumulator scraped = spark.sparkContext().longAccumulator("scraped");
-				final LongAccumulator errors = spark.sparkContext().longAccumulator("errors");
-
-				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-
-				System.setProperty("webdriver.chrome.whitelistedIps", "");
-
-				BMUSEScraper scraper = new BMUSEScraper();
-				String url = sitemapUrl.toLowerCase();
-				Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);
-				long total = urls.size();
-
-				Path output = new Path(
-					nameNode
-						.concat(workingPath)
-						.concat(rdfOutput));
-				try (SequenceFile.Writer writer = SequenceFile
-					.createWriter(
-						sc.hadoopConfiguration(),
-						SequenceFile.Writer.file(output),
-						SequenceFile.Writer.keyClass(Text.class),
-						SequenceFile.Writer.valueClass(Text.class),
-						SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
-					Stream<Element> urlStream = null;
-					if (Objects.nonNull(maxScrapedPages)) {
-						urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
-					} else {
-						urlStream = urls.stream();
-					}
-					urlStream.forEach(u -> {
-						try {
-							final Text key = new Text(u.text());
-							final Text value = new Text(scraper.scrapeUrl(u.text(), scrapingType));
-							writer.append(key, value);
-							scraped.add(1l);
-						} catch (Exception e) {
-							logger.error(u.text(), e);
-							errors.add(1l);
-						}
-					});
-				}
-
-				logger
-					.info(
-						"Total pages to scrape: " + total + " Scraped: " + scraped.value() +
-							" Errors: " + errors.value());
-			});
-	}
-}
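The removed SparkScraper persisted each page URL and its scraped nquads as Text key/value pairs in a BLOCK-compressed SequenceFile. A minimal sketch of reading such output back with the standard Hadoop API (class name and input path are hypothetical, not part of this codebase):

package eu.dnetlib.dhp.bmuse.examples; // hypothetical package, for illustration only

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class RdfDumpReader {
	public static void main(String[] args) throws Exception {
		// Hypothetical input path; the real location is nameNode + workingPath + rdfOutput.
		Path input = new Path(args[0]);
		Configuration conf = new Configuration();
		try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(input))) {
			Text key = new Text();   // page URL
			Text value = new Text(); // scraped nquads
			while (reader.next(key, value)) {
				System.out.println(key + " -> " + value.getLength() + " bytes of nquads");
			}
		}
	}
}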
@@ -1,65 +0,0 @@
-
-package eu.dnetlib.dhp.bmuse.utils;
-
-import java.io.Serializable;
-import java.util.function.Consumer;
-import java.util.function.Supplier;
-
-/** Provides serializable and throwing extensions to standard functional interfaces. */
-public class FunctionalInterfaceSupport {
-
-	private FunctionalInterfaceSupport() {
-	}
-
-	/**
-	 * Serializable consumer of any kind of objects. To be used within spark processing pipelines when supplying
-	 * functions externally.
-	 *
-	 * @param <T>
-	 */
-	@FunctionalInterface
-	public interface SerializableConsumer<T> extends Consumer<T>, Serializable {
-	}
-
-	/**
-	 * Serializable supplier of any kind of objects. To be used within spark processing pipelines when supplying
-	 * functions externally.
-	 *
-	 * @param <T>
-	 */
-	@FunctionalInterface
-	public interface SerializableSupplier<T> extends Supplier<T>, Serializable {
-	}
-
-	/**
-	 * Extension of consumer accepting functions throwing an exception.
-	 *
-	 * @param <T>
-	 * @param <E>
-	 */
-	@FunctionalInterface
-	public interface ThrowingConsumer<T, E extends Exception> {
-		void accept(T t) throws E;
-	}
-
-	/**
-	 * Extension of supplier accepting functions throwing an exception.
-	 *
-	 * @param <T>
-	 * @param <E>
-	 */
-	@FunctionalInterface
-	public interface ThrowingSupplier<T, E extends Exception> {
-		T get() throws E;
-	}
-
-	/**
-	 * Extension of runnable accepting functions throwing an exception.
-	 *
-	 * @param <E>
-	 */
-	@FunctionalInterface
-	public interface ThrowingRunnable<E extends Exception> {
-		void run() throws E;
-	}
-}
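ThrowingConsumer is the hook that lets SparkSessionSupport accept lambdas that throw checked exceptions. A minimal, self-contained sketch of how it can be used (package and class name are hypothetical):

package eu.dnetlib.dhp.bmuse.examples; // hypothetical package, for illustration only

import java.io.IOException;

import eu.dnetlib.dhp.bmuse.utils.FunctionalInterfaceSupport;

public class ThrowingConsumerExample {
	public static void main(String[] args) throws IOException {
		// The lambda may throw a checked IOException; a plain java.util.function.Consumer could not.
		FunctionalInterfaceSupport.ThrowingConsumer<String, IOException> consumer = s -> {
			if (s.isEmpty()) {
				throw new IOException("empty input");
			}
			System.out.println(s);
		};
		consumer.accept("hello"); // the checked exception propagates to the caller's throws clause
	}
}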
@@ -1,75 +0,0 @@
-
-package eu.dnetlib.dhp.bmuse.utils;
-
-import java.util.Objects;
-import java.util.function.Function;
-
-import org.apache.spark.SparkConf;
-import org.apache.spark.sql.SparkSession;
-
-/** SparkSession utility methods. */
-public class SparkSessionSupport {
-
-	private SparkSessionSupport() {
-	}
-
-	/**
-	 * Runs a given function using SparkSession created using default builder and supplied SparkConf. Stops SparkSession
-	 * when SparkSession is managed. Allows to reuse SparkSession created externally.
-	 *
-	 * @param conf SparkConf instance
-	 * @param isSparkSessionManaged When true will stop SparkSession
-	 * @param fn Consumer to be applied to constructed SparkSession
-	 */
-	public static void runWithSparkSession(
-		SparkConf conf, Boolean isSparkSessionManaged,
-		FunctionalInterfaceSupport.ThrowingConsumer<SparkSession, Exception> fn) {
-		runWithSparkSession(
-			c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn);
-	}
-
-	/**
-	 * Runs a given function using SparkSession created with hive support and using default builder and supplied
-	 * SparkConf. Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally.
-	 *
-	 * @param conf SparkConf instance
-	 * @param isSparkSessionManaged When true will stop SparkSession
-	 * @param fn Consumer to be applied to constructed SparkSession
-	 */
-	public static void runWithSparkHiveSession(
-		SparkConf conf, Boolean isSparkSessionManaged,
-		FunctionalInterfaceSupport.ThrowingConsumer<SparkSession, Exception> fn) {
-		runWithSparkSession(
-			c -> SparkSession.builder().config(c).enableHiveSupport().getOrCreate(),
-			conf,
-			isSparkSessionManaged,
-			fn);
-	}
-
-	/**
-	 * Runs a given function using SparkSession created using supplied builder and supplied SparkConf. Stops
-	 * SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally.
-	 *
-	 * @param sparkSessionBuilder Builder of SparkSession
-	 * @param conf SparkConf instance
-	 * @param isSparkSessionManaged When true will stop SparkSession
-	 * @param fn Consumer to be applied to constructed SparkSession
-	 */
-	public static void runWithSparkSession(
-		Function<SparkConf, SparkSession> sparkSessionBuilder,
-		SparkConf conf,
-		Boolean isSparkSessionManaged,
-		FunctionalInterfaceSupport.ThrowingConsumer<SparkSession, Exception> fn) {
-		SparkSession spark = null;
-		try {
-			spark = sparkSessionBuilder.apply(conf);
-			fn.accept(spark);
-		} catch (Exception e) {
-			throw new RuntimeException(e);
-		} finally {
-			if (Objects.nonNull(spark) && isSparkSessionManaged) {
-				spark.stop();
-			}
-		}
-	}
-}
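The removed SparkScraper drove the first overload above with a SparkConf, the isSparkSessionManaged flag, and a job body. A condensed sketch of that calling pattern (package, class name, and job body are hypothetical):

package eu.dnetlib.dhp.bmuse.examples; // hypothetical package, for illustration only

import static eu.dnetlib.dhp.bmuse.utils.SparkSessionSupport.runWithSparkSession;

import org.apache.spark.SparkConf;

public class SessionSupportExample {
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("example").setMaster("local[*]");
		// With isSparkSessionManaged = true the helper stops the session in its finally block;
		// pass false to reuse a SparkSession owned by the caller.
		runWithSparkSession(conf, Boolean.TRUE, spark -> {
			long n = spark.range(10).count(); // any work against the managed SparkSession
			System.out.println("rows: " + n);
		});
	}
}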