forked from D-Net/dnet-hadoop
updated dhp-rdfconverter version to 1.2.5-SNAPSHOT
This commit is contained in:
parent 6fa9624c29
commit baa312f256
@@ -1,96 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<parent>
		<groupId>eu.dnetlib.dhp</groupId>
		<artifactId>dhp-workflows</artifactId>
		<version>1.2.4-SNAPSHOT</version>
	</parent>
	<artifactId>dhp-bmuse</artifactId>

	<dependencies>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-core_2.11</artifactId>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-sql_2.11</artifactId>
		</dependency>
		<dependency>
			<groupId>hwu.elixir</groupId>
			<artifactId>bmuse-core</artifactId>
			<version>0.5.4</version>
		</dependency>

		<dependency>
			<groupId>org.apache.any23</groupId>
			<artifactId>apache-any23-core</artifactId>
			<version>2.3</version>
		</dependency>
		<dependency>
			<groupId>org.eclipse.rdf4j</groupId>
			<artifactId>rdf4j-rio-rdfxml</artifactId>
			<version>3.7.1</version>
		</dependency>
		<dependency>
			<groupId>org.eclipse.rdf4j</groupId>
			<artifactId>rdf4j-model</artifactId>
			<version>3.7.1</version>
		</dependency>
		<!-- rdf 2.5.4 to 3.7.1 -->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.13.1</version>
		</dependency>
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-java</artifactId>
			<version>3.141.59</version>
		</dependency>
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>2.6</version>
		</dependency>
		<dependency>
			<groupId>commons-validator</groupId>
			<artifactId>commons-validator</artifactId>
			<version>1.6</version>
		</dependency>

		<dependency>
			<groupId>com.google.guava</groupId>
			<artifactId>guava</artifactId>
			<version>22.0</version>
		</dependency>
		<dependency>
			<groupId>com.squareup.okhttp3</groupId>
			<artifactId>okhttp</artifactId>
			<version>3.11.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-compress</artifactId>
			<version>1.18</version>
		</dependency>
		<dependency>
			<groupId>com.fasterxml.jackson.core</groupId>
			<artifactId>jackson-core</artifactId>
			<version>2.9.6</version>
		</dependency>

		<dependency>
			<groupId>com.fasterxml.jackson.core</groupId>
			<artifactId>jackson-annotations</artifactId>
			<version>2.9.6</version>
		</dependency>
		<dependency>
			<groupId>com.fasterxml.jackson.core</groupId>
			<artifactId>jackson-databind</artifactId>
			<version>2.9.6</version>
		</dependency>
	</dependencies>

</project>
@@ -1,62 +0,0 @@
https://grafana.d4science.org/d/xfpJB9FGz-pa1/1-node-exporter-garr-pa1?orgId=1&var-origin_prometheus=&var-job=node&var-hostname=hadoop-worker8.garr-pa1.d4science.org&var-node=hadoop-worker-8&var-device=All&var-interval=2m&var-maxmount=%2Fhadoop&var-show_hostname=hadoop-worker8.garr-pa1.d4science.org&var-total=49&from=1638522510612&to=1638526110612

PED
<property>
	<name>workingPath</name>
	<value>/data/bioschema/ped/</value>
	<description>the working path</description>
</property>
<property>
	<name>sitemapUrl</name>
	<value>https://proteinensemble.org/sitemap2.xml.gz</value>
</property>
<property>
	<name>sitemapURLKey</name>
	<value>loc</value>
</property>
<property>
	<name>dynamic</name>
	<value>true</value>
	<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
</property>

DISPROT
<property>
	<name>workingPath</name>
	<value>/data/bioschema/disprot/</value>
	<description>the working path</description>
</property>
<property>
	<name>sitemapUrl</name>
	<value>https://disprot.org/sitemap2.xml.gz</value>
</property>
<property>
	<name>sitemapURLKey</name>
	<value>loc</value>
</property>
<property>
	<name>dynamic</name>
	<value>true</value>
	<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
</property>

MOBIDB
<property>
	<name>workingPath</name>
	<value>/data/bioschema/mobidb/</value>
	<description>the working path</description>
</property>
<property>
	<name>sitemapUrl</name>
	<value>https://mobidb.org/sitemap2.xml.gz</value>
</property>
<property>
	<name>sitemapURLKey</name>
	<value>loc</value>
</property>
<property>
	<name>dynamic</name>
	<value>true</value>
	<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
</property>
<property>
@@ -1,113 +0,0 @@

package eu.dnetlib.dhp.bmuse.bioschema;

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.bmuse.utils.ArgumentApplicationParser;
import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper;
import eu.dnetlib.dhp.bmuse.utils.UrlParser;

public class ScrapingJob {

	static Logger logger = LoggerFactory.getLogger(ScrapingJob.class);

	public static void main(String[] args) throws Exception {

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					ScrapingJob.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json")));
		parser.parseArgument(args);

		final String nameNode = parser.get("nameNode");
		final String workingPath = parser.get("workingPath");
		final String rdfOutput = parser.get("rdfOutput");
		final String sitemapUrl = parser.get("sitemapUrl");
		final String sitemapURLKey = parser.get("sitemapURLKey");
		final String dynamic = parser.get("dynamic");
		final String maxScrapedPages = parser.get("maxScrapedPages");
		Boolean dynamicValue = true;
		if (Objects.nonNull(dynamic)) {
			dynamicValue = Boolean.parseBoolean(dynamic);
		}
		final boolean scrapingType = dynamicValue.booleanValue();

		logger
			.info(
				"*************************** STARTING_SCRAPE");

		BMUSEScraper scraper = new BMUSEScraper();
		String url = sitemapUrl.toLowerCase();
		Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);

		Path output = new Path(
			nameNode
				.concat(workingPath)
				.concat(rdfOutput));
		Configuration conf = getHadoopConfiguration(nameNode);
		try (SequenceFile.Writer writer = SequenceFile
			.createWriter(
				conf,
				SequenceFile.Writer.file(output),
				SequenceFile.Writer.keyClass(Text.class),
				SequenceFile.Writer.valueClass(Text.class),
				SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
			Stream<Element> urlStream = null;
			if (Objects.nonNull(maxScrapedPages)) {
				urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
			} else {
				urlStream = urls.stream();
			}
			List<Element> sites = urlStream.collect(Collectors.toList());
			logger.info("Pages available for scraping: " + sites.size());
			sites.forEach(u -> {
				final Text key = new Text(u.text());
				String nquads;
				try {
					String site = u.text();
					logger.debug(site + " > parsing");
					nquads = scraper.scrapeUrl(site, scrapingType);
					final Text value = new Text(nquads);
					writer.append(key, value);
				} catch (Throwable t) {
					logger.error(u.text() + " -> ", t);
				}
			});
		}

		logger
			.info(
				"*************************** ENDING_SCRAPE: ");
	}

	public static Configuration getHadoopConfiguration(String nameNode) {
		// ====== Init HDFS File System Object
		Configuration conf = new Configuration();
		// Set FileSystem URI
		conf.set("fs.defaultFS", nameNode);
		// Because of Maven
		conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
		conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

		System.setProperty("hadoop.home.dir", "/");
		return conf;
	}
}
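Editor's note: the job above appends one (page url, nquads) pair per scraped page to a block-gzip SequenceFile. As an illustrative sketch only (not part of this changeset), the output could be read back with the standard Hadoop reader; the nameNode and path values are the defaults taken from config-default.xml and the workflow parameters further down.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class NquadsSeqReader {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// illustrative values from config-default.xml and the workflow defaults
		conf.set("fs.defaultFS", "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020");
		Path input = new Path("/data/bioschema/mobidb/nquads.seq"); // workingPath + rdfOutput
		try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(input))) {
			Text url = new Text();    // key: the scraped page url
			Text nquads = new Text(); // value: the nquads produced by BMUSEScraper
			while (reader.next(url, nquads)) {
				System.out.println(url + " -> " + nquads.getLength() + " bytes of nquads");
			}
		}
	}
}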
@@ -1,94 +0,0 @@

package eu.dnetlib.dhp.bmuse.utils;

import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.*;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

public class ArgumentApplicationParser implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);

	private final Options options = new Options();
	private final Map<String, String> objectMap = new HashMap<>();

	private final List<String> compressedValues = new ArrayList<>();

	public ArgumentApplicationParser(final String json_configuration) throws IOException {
		final ObjectMapper mapper = new ObjectMapper();
		final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
		createOptionMap(configuration);
	}

	public ArgumentApplicationParser(final OptionsParameter[] configuration) {
		createOptionMap(configuration);
	}

	private void createOptionMap(final OptionsParameter[] configuration) {
		Arrays
			.stream(configuration)
			.map(
				conf -> {
					final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
					o.setLongOpt(conf.getParamLongName());
					o.setRequired(conf.isParamRequired());
					if (conf.isCompressed()) {
						compressedValues.add(conf.getParamLongName());
					}
					return o;
				})
			.forEach(options::addOption);
	}

	public static String decompressValue(final String abstractCompressed) {
		try {
			byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
			GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
			final StringWriter stringWriter = new StringWriter();
			IOUtils.copy(gis, stringWriter);
			return stringWriter.toString();
		} catch (IOException e) {
			log.error("Wrong value to decompress: {}", abstractCompressed);
			throw new IllegalArgumentException(e);
		}
	}

	public static String compressArgument(final String value) throws IOException {
		ByteArrayOutputStream out = new ByteArrayOutputStream();
		GZIPOutputStream gzip = new GZIPOutputStream(out);
		gzip.write(value.getBytes());
		gzip.close();
		return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
	}

	public void parseArgument(final String[] args) throws ParseException {
		CommandLineParser parser = new BasicParser();
		CommandLine cmd = parser.parse(options, args);
		Arrays
			.stream(cmd.getOptions())
			.forEach(
				it -> objectMap
					.put(
						it.getLongOpt(),
						compressedValues.contains(it.getLongOpt())
							? decompressValue(it.getValue())
							: it.getValue()));
	}

	public String get(final String key) {
		return objectMap.get(key);
	}

	public Map<String, String> getObjectMap() {
		return objectMap;
	}
}
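Editor's note: parameters declared with "compressed": true are passed on the command line as gzip+Base64 strings and are decompressed transparently by parseArgument. A minimal round-trip sketch (illustrative, not part of this changeset):

// hypothetical value, only to show the compressArgument/decompressValue round trip
String original = "<some very long xml or json payload>";
String packed = ArgumentApplicationParser.compressArgument(original); // gzip + Base64
String unpacked = ArgumentApplicationParser.decompressValue(packed);  // Base64 + gunzip
assert original.equals(unpacked);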
@@ -1,91 +0,0 @@

package eu.dnetlib.dhp.bmuse.utils;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;

import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;

public class BMUSEScraper extends ScraperFilteredCore {

	private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName());

	public String scrapeUrl(String url, Boolean dynamic) throws Exception {
		logger.debug(url + " > scraping");
		url = fixURL(url);

		String html = "";
		// The dynamic boolean determines if the scraper should use selenium or JSOUP to scrape the information
		// (dynamic and static respectively)

		if (dynamic) {
			html = wrapHTMLExtraction(url);
		} else {
			html = wrapHTMLExtractionStatic(url);
		}

		if (html == null || html.contentEquals(""))
			throw new Exception("empty html");

		html = injectId(html, url);

		logger.debug(url + " > html scraped from " + url);
		DocumentSource source = new StringDocumentSource(html, url);
		String n3 = html2Triples(source, url);
		if (n3 == null) {
			throw new MissingMarkupException(url);
		}

		logger.debug(url + " > processing triples");
		IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
		Model updatedModel = processTriples(n3, sourceIRI, 0L);
		if (updatedModel == null) {
			throw new Exception("rdf model null");
		}

		logger.debug(url + " > generating nquads");
		try (StringWriter nquadsWriter = new StringWriter()) {
			Rio.write(updatedModel, nquadsWriter, RDFFormat.NQUADS);
			logger.debug(url + " > nquads generated");
			return nquadsWriter.toString();
		}
	}

	private String html2Triples(DocumentSource source, String url) throws Exception {
		Any23 runner = new Any23();
		try (ByteArrayOutputStream out = new ByteArrayOutputStream();
			TripleHandler handler = new NTriplesWriter(out)) {
			runner.extract(source, handler);
			return out.toString("UTF-8");
		} catch (ExtractionException e) {
			logger.error("Cannot extract triples", e);
		} catch (IOException e1) {
			logger.error("IO error whilst extracting triples", e1);
		} catch (TripleHandlerException e2) {
			logger.error("TripleHandlerException", e2);
		}
		return null;
	}
}
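Editor's note: a hypothetical single-page call of the scraper above (not part of this changeset), using the PED record URL already referenced by Html2TriplesTest further down. The dynamic=true path drives a browser via Selenium and needs the chromedriver configured by the chromiumDriverLocation property below; the static path uses JSoup only.

// illustrative usage of BMUSEScraper.scrapeUrl
BMUSEScraper scraper = new BMUSEScraper();
String pageUrl = "https://proteinensemble.org/PED00001";
String nquadsStatic = scraper.scrapeUrl(pageUrl, false); // static: JSoup extraction
String nquadsDynamic = scraper.scrapeUrl(pageUrl, true); // dynamic: Selenium, needs the configured chromedriver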
@@ -1,35 +0,0 @@

package eu.dnetlib.dhp.bmuse.utils;

public class OptionsParameter {

	private String paramName;
	private String paramLongName;
	private String paramDescription;
	private boolean paramRequired;
	private boolean compressed;

	public String getParamName() {
		return paramName;
	}

	public String getParamLongName() {
		return paramLongName;
	}

	public String getParamDescription() {
		return paramDescription;
	}

	public boolean isParamRequired() {
		return paramRequired;
	}

	public boolean isCompressed() {
		return compressed;
	}

	public void setCompressed(boolean compressed) {
		this.compressed = compressed;
	}
}
@@ -1,65 +0,0 @@

package eu.dnetlib.dhp.bmuse.utils;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import hwu.elixir.utils.Helpers;

public class UrlParser {

	private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());

	public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {

		Document doc = new Document(url);
		Document urlSitemapListsNested;
		Elements elements = new Elements();
		Elements sitemaps = new Elements();
		boolean sitemapindex = false;
		boolean urlset = false;

		try {
			int urlLength = url.length();
			logger.info("parse sitemap list");
			String sitemapExt = url.substring(urlLength - 3, urlLength);
			if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
				logger.info("compressed sitemap");
				byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
				doc = Helpers.gzipFileDecompression(bytes);
			} else {
				doc = Jsoup.connect(url).maxBodySize(0).get();
			}

		} catch (IOException e) {
			logger.error("Jsoup parsing exception: " + e.getMessage());
		}

		try {

			elements = doc.select(sitemapURLKey);

			// check the html if it is a sitemapindex or a urlset
			sitemapindex = doc.outerHtml().contains("sitemapindex");
			urlset = doc.outerHtml().contains("urlset");
		} catch (NullPointerException e) {
			logger.error(e.getMessage());
		}

		if (sitemapindex) {
			// if sitemapindex get the loc of all the sitemaps
			// added warning for sitemap index files
			logger
				.warn(
					"please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
			sitemaps = doc.select(sitemapURLKey);
		}

		return elements;
	}
}
@@ -1,44 +0,0 @@
[
	{
		"paramName": "n",
		"paramLongName": "nameNode",
		"paramDescription": "the Name Node URI",
		"paramRequired": true
	},
	{
		"paramName": "w",
		"paramLongName": "workingPath",
		"paramDescription": "the working path",
		"paramRequired": true
	},
	{
		"paramName": "r",
		"paramLongName": "rdfOutput",
		"paramDescription": "the rdf output of the scraping step",
		"paramRequired": true
	},
	{
		"paramName": "u",
		"paramLongName": "sitemapUrl",
		"paramDescription": "the sitemap url",
		"paramRequired": true
	},
	{
		"paramName": "k",
		"paramLongName": "sitemapURLKey",
		"paramDescription": "the sitemap file contains a list of xml entries, each one has a tag identified with sitemapURLKey with the url as value",
		"paramRequired": true
	},
	{
		"paramName": "d",
		"paramLongName": "dynamic",
		"paramDescription": "the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)",
		"paramRequired": false
	},
	{
		"paramName": "m",
		"paramLongName": "maxScrapedPages",
		"paramDescription": "max number of pages that will be scraped, default: no limit",
		"paramRequired": false
	}
]
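Editor's note: a sketch of how these definitions are consumed (illustrative only; argument values are the defaults appearing elsewhere in this changeset, and jsonConfiguration is a hypothetical variable holding the JSON above). ScrapingJob loads the JSON into ArgumentApplicationParser and reads each value back by its long name:

// hypothetical invocation mirroring the workflow's <arg> list
ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(new String[] {
	"--nameNode", "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020",
	"--workingPath", "/data/bioschema/mobidb/",
	"--rdfOutput", "nquads.seq",
	"--sitemapUrl", "https://mobidb.org/sitemap2.xml.gz",
	"--sitemapURLKey", "loc",
	"--dynamic", "true",
	"--maxScrapedPages", "5"
});
String workingPath = parser.get("workingPath"); // "/data/bioschema/mobidb/"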
@@ -1,22 +0,0 @@
<configuration>
	<property>
		<name>jobTracker</name>
		<value>yarn</value>
	</property>
	<property>
		<name>nameNode</name>
		<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
	</property>
	<property>
		<name>oozie.launcher.mapreduce.user.classpath.first</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.use.system.libpath</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.action.sharelib.for.spark</name>
		<value>spark2</value>
	</property>
</configuration>
@@ -1,81 +0,0 @@
<workflow-app name="BioSchemaHarvester" xmlns="uri:oozie:workflow:0.5">
	<parameters>
		<property>
			<name>workingPath</name>
			<value>/data/bioschema/mobidb/</value>
			<description>the working path</description>
		</property>
		<property>
			<name>sitemapUrl</name>
			<value>https://mobidb.org/sitemap2.xml.gz</value>
		</property>
		<property>
			<name>sitemapURLKey</name>
			<value>loc</value>
		</property>
		<property>
			<name>dynamic</name>
			<value>true</value>
			<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
		</property>
		<property>
			<name>maxScrapedPages</name>
			<value>5</value>
			<description>max number of pages that will be scraped, default: no limit</description>
		</property>
		<property>
			<name>rdfOutput</name>
			<value>nquads.seq</value>
			<description>rdf output of scraping step</description>
		</property>
		<property>
			<name>scraping_java_opts</name>
			<value>-Xmx4g -Dwebdriver.chrome.whitelistedIps=</value>
			<description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
		</property>
	</parameters>

	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
	</global>

	<start to="ResetWorkingPath"/>
	<kill name="Kill">
		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>

	<action name="ResetWorkingPath">
		<fs>
			<delete path='${workingPath}${rdfOutput}'/>
		</fs>
		<ok to="bmuseScraping"/>
		<error to="Kill"/>
	</action>

	<action name="bmuseScraping">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<configuration>
				<property>
					<name>oozie.launcher.mapreduce.user.classpath.first</name>
					<value>true</value>
				</property>
			</configuration>
			<main-class>eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob</main-class>
			<java-opts>${scraping_java_opts}</java-opts>
			<arg>--nameNode</arg><arg>${nameNode}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--rdfOutput</arg><arg>${rdfOutput}</arg>
			<arg>--sitemapUrl</arg><arg>${sitemapUrl}</arg>
			<arg>--sitemapURLKey</arg><arg>${sitemapURLKey}</arg>
			<arg>--dynamic</arg><arg>${dynamic}</arg>
			<arg>--maxScrapedPages</arg><arg>${maxScrapedPages}</arg>
		</java>
		<ok to="End"/>
		<error to="Kill"/>
	</action>

	<end name="End"/>
</workflow-app>
@@ -1,4 +0,0 @@
maxLimitScrape=200000
schemaContext=https\://schema.org/docs/jsonldcontext.jsonld
dynamic=true
chromiumDriverLocation=/bin/chromedriver
@@ -1,9 +0,0 @@
# Set root logger level to INFO and its only appender to A1.
log4j.rootLogger=INFO, A1

# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender

# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
@@ -1,45 +0,0 @@

package eu.dnetlib.dhp.bmuse.bioschema;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Html2TriplesTest {

	static Logger logger = LoggerFactory.getLogger(Html2TriplesTest.class);

	@Test
	// @Disabled
	void conversionTest() throws Exception {
		InputStream is = Html2TriplesTest.class.getResourceAsStream("/eu/dnetlib/dhp/bmuse/bioschema/ped.html");
		String page = IOUtils.toString(is, StandardCharsets.UTF_8.name());
		DocumentSource source = new StringDocumentSource(page, "https://proteinensemble.org/PED00001");
		Any23 runner = new Any23();
		try (ByteArrayOutputStream out = new ByteArrayOutputStream();
			TripleHandler handler = new NTriplesWriter(out)) {
			runner.extract(source, handler);
			logger.info(out.toString("UTF-8"));
		} catch (ExtractionException e) {
			logger.error("Cannot extract triples", e);
		} catch (IOException e1) {
			logger.error("IO error whilst extracting triples", e1);
		} catch (TripleHandlerException e2) {
			logger.error("TripleHandlerException", e2);
		}

	}
}
@@ -1,24 +0,0 @@

package eu.dnetlib.dhp.bmuse.bioschema;

import org.jsoup.select.Elements;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.bmuse.utils.UrlParser;

public class SitemapTest {

	static Logger logger = LoggerFactory.getLogger(SitemapTest.class);

	@Test
	@Disabled
	void sitemapGzTest() throws Exception {
		Elements urls = UrlParser.getSitemapList("https://disprot.org/sitemap2.xml.gz", "loc");
		urls.forEach(url -> {
			logger.info(url.text());
		});
	}
}
File diff suppressed because one or more lines are too long
@@ -4,7 +4,7 @@
 	<parent>
 		<groupId>eu.dnetlib.dhp</groupId>
 		<artifactId>dhp-workflows</artifactId>
-		<version>1.2.4-SNAPSHOT</version>
+		<version>1.2.5-SNAPSHOT</version>
 	</parent>
 	<artifactId>dhp-rdfconverter</artifactId>
