added app for bioschemas sources harvesting

This commit is contained in:
Enrico Ottonello 2022-06-06 09:37:29 +02:00
parent 19010a9624
commit 7375534764
21 changed files with 1475 additions and 0 deletions

View File

@ -0,0 +1,10 @@
{
"type_source": "SVN",
"goal": "package -U source:jar",
"url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk/",
"deploy_repository": "dnet5-snapshots",
"version": "5",
"mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it, enrico.ottonello@isti.cnr.it",
"deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots",
"name": "dnet-ariadneplus-graphdb-publisher"
}

View File

@ -0,0 +1,86 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>apps</artifactId>
<version>3.2.8-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>jar</packaging>
<artifactId>bioschemas-api</artifactId>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>hwu.elixir</groupId>
<artifactId>bmuse-core</artifactId>
<version>0.5.4</version>
</dependency>
<dependency>
<groupId>org.freemarker</groupId>
<artifactId>freemarker</artifactId>
<version>2.3.27-incubating</version>
</dependency>
<dependency>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId>
<version>2.3</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId>
<version>3.7.1</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-model</artifactId>
<version>3.7.1</version>
</dependency>
<!-- rdf 2.5.4 to 3.7.1-->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.3</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-help-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,173 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.3.RELEASE</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-bmuse-webapp</artifactId>
<packaging>jar</packaging>
<version>1.0.0-SNAPSHOT</version>
<scm>
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk</developerConnection>
<url>https://github.com/spring-projects/spring-boot/spring-boot-starter-parent/dnet-bmuse-webapp</url>
</scm>
<ciManagement>
<system>jenkins</system>
<url>https://jenkins-dnet.d4science.org/view/DNet50/job/DSL50_dnet-bmuse-webapp/</url>
</ciManagement>
<distributionManagement>
<repository>
<id>dnet5-releases</id>
<name>D-Net 5 Releases</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
<layout>default</layout>
</repository>
</distributionManagement>
<!-- Inherit defaults from Spring Boot -->
<repositories>
<repository>
<id>dnet-deps</id>
<name>D-Net Dependencies</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
<layout>default</layout>
</repository>
<repository>
<id>dnet5-releases</id>
<name>D-Net 5 Releases</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
<layout>default</layout>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>dnet5-snapshots</id>
<name>D-Net 5 Snapshots</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13-rc-1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-autoconfigure</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>hwu.elixir</groupId>
<artifactId>bmuse-core</artifactId>
<version>0.5.4</version>
</dependency>
<dependency>
<groupId>org.freemarker</groupId>
<artifactId>freemarker</artifactId>
<version>2.3.27-incubating</version>
</dependency>
<dependency>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId>
<version>2.3</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId>
<version>3.7.1</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-model</artifactId>
<version>3.7.1</version>
</dependency>
<!-- rdf 2.5.4 to 3.7.1-->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.3</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<executable>true</executable>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<java.version>1.8</java.version>
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
</properties>
<profiles>
<profile>
<id>java8-doclint-disabled</id>
<activation>
<jdk>[1.8,)</jdk>
</activation>
<properties>
<javadoc.opts>-Xdoclint:none</javadoc.opts>
</properties>
</profile>
</profiles>
</project>

View File

@ -0,0 +1,45 @@
package eu.dnetlib.bmuse_webapp;
import org.springframework.boot.web.client.RestTemplateBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Profile;
import org.springframework.web.client.RestTemplate;
import freemarker.cache.ClassTemplateLoader;
import freemarker.template.TemplateExceptionHandler;
/**
* @author enrico.ottonello
*
*/
@Profile("garr")
@Configuration
public class AppConfigGarr {
@Bean
public RestTemplate jrrRestTemplate(){
//TODO: move configuration here from CatalogueRegistrator?
return new RestTemplateBuilder().build();
}
@Bean
public freemarker.template.Configuration freemarkerConfig(){
freemarker.template.Configuration config = new freemarker.template.Configuration(freemarker.template.Configuration.VERSION_2_3_27);
ClassTemplateLoader ctl = new ClassTemplateLoader(getClass(), "/eu/dnetlib/bmuse_webapp/sparql");
config.setTemplateLoader(ctl);
config.setDefaultEncoding("UTF-8");
// Sets how errors will appear.
// During web page *development* TemplateExceptionHandler.HTML_DEBUG_HANDLER is better.
config.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
// Don't log exceptions inside FreeMarker, since they will be rethrown to the caller anyway:
config.setLogTemplateExceptions(false);
// Wrap unchecked exceptions thrown during template processing into TemplateException-s.
config.setWrapUncheckedExceptions(true);
return config;
}
}

View File

@ -0,0 +1,42 @@
package eu.dnetlib.bmuse_webapp;
import eu.dnetlib.common.app.AbstractDnetApp;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.cache.annotation.EnableCaching;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.scheduling.annotation.EnableScheduling;
import springfox.documentation.builders.ApiInfoBuilder;
import springfox.documentation.builders.RequestHandlerSelectors;
import springfox.documentation.service.ApiInfo;
import springfox.documentation.spring.web.plugins.Docket;
import springfox.documentation.swagger2.annotations.EnableSwagger2;
@SpringBootApplication
@EnableSwagger2
@EnableCaching
@EnableScheduling
@ComponentScan(basePackages = "eu.dnetlib")
public class MainApplication extends AbstractDnetApp {
public static void main(final String[] args) {
SpringApplication.run(MainApplication.class, args);
}
@Override
protected void configSwagger(final Docket docket) {
docket.select()
.apis(RequestHandlerSelectors.any())
.paths(p -> p.contains("/api/"))
.build()
.apiInfo(new ApiInfoBuilder()
.title("D-Net Bioschemas Service APIs")
.description("APIs documentation")
.version("1.1")
.contact(ApiInfo.DEFAULT_CONTACT)
.license("Apache 2.0")
.licenseUrl("http://www.apache.org/licenses/LICENSE-2.0")
.build());
}
}

View File

@ -0,0 +1,261 @@
package eu.dnetlib.bmuse_webapp;
import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;
import eu.dnetlib.bmuse_webapp.scraper.ServiceScraper;
import eu.dnetlib.bmuse_webapp.utils.UrlParser;
import hwu.elixir.utils.Helpers;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Runs the scrape. Collects a list of URLs (in the form of CrawlRecords) to scrape,
* scrapes them in turn, writes the extracted (bio)schema markup to a single output file
* (one record per line) and adds provenance to each CrawlRecord.
*
*
*/
public class ServiceScrapeDriver {
private static final String propertiesFile = "application.properties";
private int waitTime = 1;
private int numberOfPagesToCrawlInALoop;
private int totalNumberOfPagesToCrawlInASession;
private String outputFolder;
private int pagesCounter = 0;
private int scrapeVersion = 1;
private String sitemapUrl;
private String sitemapURLKey;
private String maxScrapedPages;
private String outputFilename;
private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);
public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
this.sitemapUrl = sitemapUrl;
this.sitemapURLKey = sitemapURLKey;
this.maxScrapedPages = maxScrapedPages;
this.outputFilename = outputFilename;
}
/**
* Runs the scrape process
*
*/
public void start() throws IOException {
runScrape();
}
/**
* Fires off threads
* Originally designed as a multi-threaded process; now reduced to a single thread as
* the selenium webdriver is too expensive to run multi-threaded. However, the threading
* has been left in situ in case it is useful in the future.
*
*/
private void runScrape() throws IOException {
processProperties();
String url = sitemapUrl.toLowerCase();
Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());
Stream<Element> urlStream = null;
if (Objects.nonNull(maxScrapedPages)) {
urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
} else {
urlStream = urls.stream();
}
List<Element> sites = urlStream.collect(Collectors.toList());
logger.info("Pages available for scraping: " + sites.size());
List<CrawlRecord> pagesToPull = generatePagesToPull(sites);
if (pagesToPull.isEmpty()) {
logger.error("Cannot retrieve URLs");
throw new RuntimeException("No pages found from sitemap");
}
ScrapeState scrapeState = new ScrapeState(pagesToPull);
logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
logger.info(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
scrape1.setName("S1");
// ScrapeThread scrape2 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
// scrape2.setName("S2");
//
// ScrapeThread scrape3 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
// scrape3.setName("S3");
//
// ScrapeThread scrape4 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
// scrape4.setName("S4");
scrape1.start();
// scrape2.start();
// scrape3.start();
// scrape4.start();
long startTime = System.nanoTime();
try {
scrape1.join();
// scrape2.join();
// scrape3.join();
// scrape4.join();
} catch (InterruptedException e) {
logger.error("Exception waiting on thread");
e.printStackTrace();
return;
}
if(!scrape1.isFileWritten()) {
logger.error("Could not write output file so shutting down!");
Date date = new Date(System.currentTimeMillis());
logger.info("ENDING CRAWL after failure at: " + formatter.format(date));
return;
}
logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
long endTime = System.nanoTime();
long timeElapsed = endTime - startTime;
logger.info("Time in s to complete: " + timeElapsed / 1e+9);
updateDatabase(scrapeState);
pagesCounter += numberOfPagesToCrawlInALoop;
logger.info("ENDED loop");
}
// Map<String, Object> nquads = scrapeState.getNquadsConcurrentHashMap();
// logger.info("Available nquads records: "+nquads.size() );
logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
File output = new File(outputFolder.concat("/").concat(outputFilename));
if (output.exists()) {
output.delete();
}
// try-with-resources so the writer is always closed, even if a write fails
try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(output.getAbsoluteFile(), true))) { // true to append
List<CrawlRecord> processed = scrapeState.getPagesProcessed();
for (CrawlRecord record : processed) {
try {
bufferedWriter.write(record.getNquads());
bufferedWriter.newLine();
bufferedWriter.flush();
} catch (IOException e) {
logger.error("Cannot write nquads for " + record.getUrl(), e);
}
}
}
logger.info("Dumped nquads to " + output.getAbsolutePath());
}
/**
*
* @param scrapeState State of scrape at end
* @return true if success / false otherwise
* @see ScrapeState
* @see CrawlRecord
*/
private boolean updateDatabase(ScrapeState scrapeState) {
// Placeholder: no database persistence is implemented in this webapp version.
return false;
}
/**
* Get a list of URLs (in the form of CrawlRecords) that need to be scraped
*
* @return List of URLs to be scraped
* @see CrawlRecord
*/
private List<CrawlRecord> generatePagesToPull(List<Element> sites) {
List<CrawlRecord> crawls = sites
.stream()
.map(s -> {
CrawlRecord crawlRecord = new CrawlRecord(s.text());
String[] urlSplitted = crawlRecord.getUrl().split("/");
String name = urlSplitted[urlSplitted.length - 1];
crawlRecord.setName(name);
return crawlRecord;
})
.collect(Collectors.toList());
return crawls;
}
/**
* Updates properties based on properties file in src > main > resources
*
*/
private void processProperties() {
ClassLoader classLoader = ServiceScrapeDriver.class.getClassLoader();
InputStream is = classLoader.getResourceAsStream(propertiesFile);
if(is == null) {
logger.error(" Cannot find " + propertiesFile + " file");
throw new IllegalArgumentException(propertiesFile + "file is not found!");
}
Properties prop = new Properties();
try {
prop.load(is);
} catch (IOException e) {
logger.error(" Cannot load application.properties", e);
System.exit(0);
}
waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
logger.info(" waitTime: " + waitTime);
outputFolder = prop.getProperty("outputFolder").trim();
logger.info(" outputFolder: " + outputFolder);
numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
logger.info(" numberOfPagesToCrawl: " + numberOfPagesToCrawlInALoop);
totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
logger.info(" totalNumberOfPagesToCrawlInASession: " + totalNumberOfPagesToCrawlInASession);
scrapeVersion = Integer.parseInt(prop.getProperty("scrapeVersion").trim());
logger.info(" scrapeVersion: " + scrapeVersion);
logger.info("\n\n\n");
}
public String getSitemapUrl() {
return sitemapUrl;
}
public String getSitemapURLKey() {
return sitemapURLKey;
}
private String getId(String pageUrl) {
String[] parts = pageUrl.split("/");
return parts[parts.length - 1];
}
}
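For orientation, a minimal usage sketch of this driver (the REST controller added later in this commit does essentially the same); the sitemap URL, page limit and output name below are illustrative placeholders, not values taken from this commit:

import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;

import java.io.IOException;

public class ScrapeDriverExample {
    public static void main(String[] args) throws IOException {
        // Placeholders: any sitemap whose page URLs are listed under <loc> elements.
        ServiceScrapeDriver driver = new ServiceScrapeDriver(
                "https://example.org/sitemap.xml.gz",  // sitemapUrl (illustrative)
                "loc",                                 // sitemapURLKey: the sitemap tag holding page URLs
                "10",                                  // maxScrapedPages; null means no limit
                "example_base64_gzipped_nquads.txt");  // written under the configured outputFolder
        driver.start();
    }
}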

View File

@ -0,0 +1,136 @@
package eu.dnetlib.bmuse_webapp.crawl;
import java.util.Date;
import hwu.elixir.utils.Validation;
/**
*
* Store the current status of a single URL in the scrape service.
*
*
*/
public class CrawlRecord {
private Long id;
private String context = "";
private String url;
private Date dateScraped;
private StatusOfScrape status;
private boolean beingScraped;
private String name;
private String nquads;
public CrawlRecord() {
status = StatusOfScrape.UNTRIED;
}
public CrawlRecord(String url) {
Validation validation = new Validation();
if(validation.validateURI(url)) {
this.url = url;
context = "";
status = StatusOfScrape.UNTRIED;
dateScraped = null;
} else {
throw new IllegalArgumentException(url +" is not a valid url");
}
this.setId(System.currentTimeMillis());
}
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public String getUrl() {
return url;
}
public Date getDateScraped() {
return dateScraped;
}
public void setDateScraped(Date dateScraped) {
this.dateScraped = dateScraped;
}
public StatusOfScrape getStatus() {
return status;
}
public void setStatus(StatusOfScrape status) {
this.status = status;
}
public String getContext() {
return context;
}
public void setContext(String context) {
this.context = context;
}
public boolean isBeingScraped() {
return beingScraped;
}
public void setBeingScraped(boolean beingScraped) {
this.beingScraped = beingScraped;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getNquads() {
return nquads;
}
public void setNquads(String nquads) {
this.nquads = nquads;
}
@Override
public boolean equals(Object o) {
if (this == o)
return true;
if (!(o instanceof CrawlRecord))
return false;
CrawlRecord otherCrawl = (CrawlRecord) o;
return this.url.equals(otherCrawl.getUrl());
}
@Override
public int hashCode() {
// Kept consistent with equals(), which compares URLs only.
return getUrl() != null ? getUrl().hashCode() : 0;
}
}

View File

@ -0,0 +1,19 @@
package eu.dnetlib.bmuse_webapp.crawl;
/**
*
* {@link eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape} describes the possible status levels of the scrape for each URL/CrawlRecord.
*
* Each URL/CrawlRecord can have one of the following:
* DOES_NOT_EXIST = 404.
* HUMAN_INSPECTION = cannot parse for some reason; a human should see what is happening.
* UNTRIED = not scraped yet.
* FAILED = one failed attempt at scraping; will try again.
* GIVEN_UP = two failed attempts at scraping. Will not try again.
* SUCCESS = successfully scraped.
*
*/
public enum StatusOfScrape {
DOES_NOT_EXIST, HUMAN_INSPECTION, UNTRIED, FAILED, GIVEN_UP, SUCCESS;
}

View File

@ -0,0 +1,65 @@
package eu.dnetlib.bmuse_webapp.publisher;
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
import eu.dnetlib.common.controller.AbstractDnetController;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.web.bind.annotation.*;
import javax.servlet.http.HttpServletResponse;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
* @author enrico.ottonello
*
*/
@RestController
@RequestMapping("/api")
public class BMUSEWebappController extends AbstractDnetController {
private static final Log log = LogFactory.getLog(BMUSEWebappController.class);
@RequestMapping(value = "/version", method = RequestMethod.GET)
public String version() throws BMUSEWebappException {
return "1.0.0-SNAPSHOT";
}
@RequestMapping(value = "/scrape", method = RequestMethod.GET)
public String scrape(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
log.info("datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
// String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz"; scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
// String sitemapUrl = "https://proteinensemble.org/sitemap2.xml.gz"; scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
// String sitemapUrl = "https://disprot.org/sitemap2.xml.gz"; scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
String sitemapUrlKey = "loc";
String outputFilename = datasourceKey.concat("_base64_gzipped_nquads.txt");
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
service.start();
return "started";
}
@RequestMapping(value = "/nquads", method = RequestMethod.GET)
public String nquads(HttpServletResponse response) throws BMUSEWebappException, IOException {
LineIterator it = FileUtils.lineIterator(new File("/Users/enrico.ottonello/data/bmuse-output/output.nq"), "UTF-8");
try {
while (it.hasNext()) {
String line = it.nextLine();
response.getOutputStream().write(line.getBytes(StandardCharsets.UTF_8));
response.getOutputStream().println();
}
} finally {
LineIterator.closeQuietly(it);
}
return "";
}
}
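Once the webapp is deployed, the scrape can be triggered over HTTP. A hedged example, assuming the server.port (8281) and context path (/dnet-bmuse-webapp) defined in the application.properties added by this commit, and reusing the mobidb values from the commented examples above (sitemapUrl must be URL-encoded):

GET http://localhost:8281/dnet-bmuse-webapp/api/scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
GET http://localhost:8281/dnet-bmuse-webapp/api/nquads

The first call runs the scrape synchronously and writes mobidb_base64_gzipped_nquads.txt under the configured outputFolder before returning "started"; the second streams the lines of the hard-coded output.nq file.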

View File

@ -0,0 +1,28 @@
package eu.dnetlib.bmuse_webapp.publisher;
/**
* @author enrico.ottonello
*
*/
public class BMUSEWebappException extends Exception{
public BMUSEWebappException() {
}
public BMUSEWebappException(final String message) {
super(message);
}
public BMUSEWebappException(final String message, final Throwable cause) {
super(message, cause);
}
public BMUSEWebappException(final Throwable cause) {
super(cause);
}
public BMUSEWebappException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) {
super(message, cause, enableSuppression, writableStackTrace);
}
}

View File

@ -0,0 +1,17 @@
package eu.dnetlib.bmuse_webapp.publisher;
import eu.dnetlib.common.controller.AbstractDnetController;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.GetMapping;
@Controller
public class HomeController extends AbstractDnetController {
@GetMapping({
"/doc", "/swagger"
})
public String apiDoc() {
return "redirect:swagger-ui/";
}
}

View File

@ -0,0 +1,90 @@
package eu.dnetlib.bmuse_webapp.scraper;
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.StringWriter;
public class BMUSEScraper extends ScraperFilteredCore {
private static final Log logger = LogFactory.getLog(BMUSEScraper.class);
public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
logger.debug(url + " > scraping");
url = fixURL(url);
String html = "";
// The dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information
// (dynamic and static respectively)
if (dynamic) {
html = wrapHTMLExtraction(url);
} else {
html = wrapHTMLExtractionStatic(url);
}
if (html == null || html.contentEquals(""))
throw new Exception("empty html");
html = injectId(html, url);
logger.debug(url + " > html scraped from " + url);
DocumentSource source = new StringDocumentSource(html, url);
String n3 = html2Triples(source, url);
if (n3 == null) {
throw new MissingMarkupException(url);
}
logger.debug(url + " > processing triples");
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
Model updatedModel = processTriples(n3, sourceIRI, 0L);
if (updatedModel == null) {
throw new Exception("rdf model null");
}
logger.debug(url + " > generating nquads");
try (StringWriter nquadsWriter = new StringWriter()) {
Rio.write(updatedModel, nquadsWriter, RDFFormat.NQUADS);
logger.debug(url + " > nquads generated");
return nquadsWriter.toString();
}
}
private String html2Triples(DocumentSource source, String url) throws Exception {
Any23 runner = new Any23();
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
TripleHandler handler = new NTriplesWriter(out);) {
runner.extract(source, handler);
return out.toString("UTF-8");
} catch (ExtractionException e) {
logger.error("Cannot extract triples", e);
} catch (IOException e1) {
logger.error("IO error whilst extracting triples", e1);
} catch (TripleHandlerException e2) {
logger.error("TripleHandlerException", e2);
}
return null;
}
}

View File

@ -0,0 +1,157 @@
package eu.dnetlib.bmuse_webapp.scraper;
import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
*/
public class ScrapeState {
private List<CrawlRecord> urlsToScrape = Collections.synchronizedList(new ArrayList<CrawlRecord>());
private List<CrawlRecord> urlsProcessed = Collections.synchronizedList(new ArrayList<CrawlRecord>()); // should this be a set?
private Map<String, Object> nquadsConcurrentHashMap = new ConcurrentHashMap<>();
/**
*
* @param pagesToBeScraped The list of sites to be scraped
* @see ScrapeThread
* @see CrawlRecord
*/
public ScrapeState(List<CrawlRecord> pagesToBeScraped) {
urlsToScrape.addAll(pagesToBeScraped);
}
/**
* Any pages/URLs left to scrape?
* @return True for yes & false for no
* @see CrawlRecord
*/
public synchronized boolean pagesLeftToScrape() {
return !urlsToScrape.isEmpty();
}
/**
* Returns the next URL/CrawlRecord to be scraped
*
* @return First page/URL that needs to be scraped next
* @see CrawlRecord
*/
public synchronized CrawlRecord getURLToProcess() {
if (urlsToScrape.isEmpty())
return null;
return urlsToScrape.remove(0);
}
/**
* Adds the given CrawlRecord to the list of CrawlRecords successfully scraped.
* Updates the status of the CrawlRecord to SUCCESS.
*
* @param record The latest URL/page that has been successfully scraped
* @see CrawlRecord
*/
public synchronized void addSuccessfulScrapedURL(CrawlRecord record) {
record.setStatus(StatusOfScrape.SUCCESS);
urlsProcessed.add(record);
}
/**
* Adds the given CrawlRecord to the list of CrawlRecords NOT successfully scraped.
* Updates the status of the CrawlRecord; if first failure the status is FAILED.
* If status is already FAILED it is changed to GIVEN_UP.
*
* If the status is FAILED, another try will be made in a future run.
*
*
* @param record The latest URL/page that has been unsuccessfully scraped
* @see CrawlRecord
*/
public synchronized void addFailedToScrapeURL(CrawlRecord record) {
if (record.getStatus().equals(StatusOfScrape.FAILED)) {
record.setStatus(StatusOfScrape.GIVEN_UP);
} else {
record.setStatus(StatusOfScrape.FAILED);
}
urlsProcessed.add(record);
}
/**
* Changes the status of the CrawlRecord to DOES_NOT_EXIST.
* As Selenium does not return the HTTP codes, it is questionable
* how useful this is.
*
*
* @param record The latest URL/page that has been 404'd
* @see CrawlRecord
*/
public synchronized void setStatusTo404(CrawlRecord record) {
record.setStatus(StatusOfScrape.DOES_NOT_EXIST);
urlsProcessed.add(record);
}
/**
*
* Changes the status of the CrawlRecord to HUMAN_INSPECTION.
* This captures the idea that the URLs may contain unexpected markup that needs a human to
* review and possibly update the scraper.
*
* @param record The latest URL/page that needs human inspection
* @see CrawlRecord
*/
public synchronized void setStatusToHumanInspection(CrawlRecord record) {
record.setStatus(StatusOfScrape.HUMAN_INSPECTION);
urlsProcessed.add(record);
}
/**
* Returns the number of URLs that are still to be scraped in this cycle.
* This does not return the number of URLs left to scrape in the DBMS, just in the current cycle.
*
* @return Number of URLs left to scrape in this cycle
* @see CrawlRecord
*/
public synchronized int getNumberPagesLeftToScrape() {
return urlsToScrape.size();
}
/**
* Gets the full list of URLs that have been processed in this cycle.
* This does not include URLs processed in previous cycles.
*
* @return List of CrawlRecords processed in this cycle
* @see CrawlRecord
*/
public synchronized List<CrawlRecord> getPagesProcessed() {
return urlsProcessed;
}
/**
* Gets the full list of URLs/CrawlRecords regardless of whether scraped or not in the current cycle.
*
* @return List of all CrawlRecords in this cycle.
* @see CrawlRecord
*/
public synchronized List<CrawlRecord> getPagesProcessedAndUnprocessed() {
List<CrawlRecord> urlsCombined = Collections.synchronizedList(new ArrayList<CrawlRecord>());
urlsCombined.addAll(urlsProcessed);
urlsCombined.addAll(urlsToScrape);
return urlsCombined;
}
public void addNquads(String key, String nquads) {
nquadsConcurrentHashMap.putIfAbsent(key, nquads);
}
public Map<String, Object> getNquadsConcurrentHashMap() {
return nquadsConcurrentHashMap;
}
}

View File

@ -0,0 +1,109 @@
package eu.dnetlib.bmuse_webapp.scraper;
import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
import eu.dnetlib.bmuse_webapp.utils.CompressorUtil;
import hwu.elixir.scrape.exceptions.CannotWriteException;
import hwu.elixir.scrape.exceptions.FourZeroFourException;
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
//import org.apache.commons.logging.Log;
//import org.apache.commons.logging.LogFactory;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
import java.util.Date;
/**
*
* @see BMUSEScraper
* @see ScrapeState
*
*/
public class ScrapeThread extends Thread {
private ScrapeState scrapeState;
private BMUSEScraper process;
private int waitTime;
private boolean fileWritten = true;
private int scrapeVersion = 1;
private static final Log logger = LogFactory.getLog(ScrapeThread.class);
/**
* Sets up a thread for actually scraping.
*
* @param scraper Scraper that will actually do the scraping.
* @param scrapeState Object that maintains state across threads.
* @param waitTime How long (in seconds) thread should wait after scraping
* page before attempting new page.
* @param contextVersion The context URL used is 'https://bioschemas.org/crawl/CONTEXTVERSION/ID', where ID is the id of the CrawlRecord pulled.
*
*/
public ScrapeThread(BMUSEScraper scraper, ScrapeState scrapeState, int waitTime, int contextVersion) {
this.scrapeState = scrapeState;
process = scraper;
this.waitTime = waitTime;
this.scrapeVersion = contextVersion;
}
/**
* Defines the high-level scraping loop. The actual scraping is done by the
* BMUSEScraper; if a page is scraped successfully its CrawlRecord is added to
* the ScrapeState's processed list.
*
* @see BMUSEScraper
* @see ScrapeState
*/
@Override
public void run() {
while (scrapeState.pagesLeftToScrape()) {
CrawlRecord record = scrapeState.getURLToProcess();
if (record == null)
break;
record.setContext("https://bioschemas.org/crawl/" + scrapeVersion +"/" + record.getId());
record.setDateScraped(new Date());
try {
String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
// scrapeState.addNquads(record.getName(), nquads);
logger.info("downloaded "+record.getUrl());
record.setNquads(CompressorUtil.compressValue(nquads));
if (!nquads.isEmpty()) {
scrapeState.addSuccessfulScrapedURL(record);
} else {
scrapeState.addFailedToScrapeURL(record);
}
} catch(FourZeroFourException fourZeroFourException) {
scrapeState.setStatusTo404(record);
fileWritten = false;
} catch (JsonLDInspectionException je) {
scrapeState.setStatusToHumanInspection(record);
fileWritten = false;
} catch (CannotWriteException cannotWrite) {
logger.error("Caught cannot read file, setting worked to false!");
fileWritten = false;
scrapeState.addFailedToScrapeURL(record);
return; // no point in continuing
} catch (MissingMarkupException e) {
logger.error("Cannot obtain markup from " + record.getUrl() +".");
fileWritten = false;
scrapeState.addFailedToScrapeURL(record);
} catch (Exception e) {
logger.error("Unexpected error while scraping " + record.getUrl(), e);
}
try {
ScrapeThread.sleep(1000L * waitTime); // waitTime is documented in seconds; convert to milliseconds
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("Interrupted while waiting between pages", e);
}
}
process.shutdown();
}
public boolean isFileWritten() {
return fileWritten;
}
}

View File

@ -0,0 +1,72 @@
package eu.dnetlib.bmuse_webapp.scraper;
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.apache.commons.lang.time.DateUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Provides the
* actual scraping functionality.
*
* Scrapes a given URL, converts into NQuads and writes to a file (name derived
* from URL). If the file already exists it will be overwritten.
*
*
* @see ScraperFilteredCore
*
*/
public class ServiceScraper extends ScraperFilteredCore {
private static final Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
private StatusOfScrape status = null;
/**
* Orchestrates the process of scraping a site before converting the extracted
* triples to NQuads and writing to a file.
*
* @param url Site to be scraped
* @param contextCounter Number used to generate the named graph/context and
* the URLs used to replace blank nodes.
* @param outputFolderName Location to which the NQuads will be written
* @return True if success; false otherwise
* @throws FourZeroFourException
* @throws JsonLDInspectionException
* @throws CannotWriteException
* @throws MissingMarkupException
*
*/
public boolean scrape(String url, Long contextCounter, String outputFolderName, String fileName, StatusOfScrape status) throws FourZeroFourException, JsonLDInspectionException, CannotWriteException, MissingMarkupException {
this.status = status;
logger.info("scraping "+url + " to "+fileName);
return scrape(url, outputFolderName, fileName, contextCounter, true);
}
@Override
/* Now takes account of StatusOfScrape
*/
protected String wrapHTMLExtraction(String url) throws FourZeroFourException {
String html = "";
if (status.equals(StatusOfScrape.UNTRIED) || status.equals(StatusOfScrape.FAILED)) {
try {
html = getHtmlViaSelenium(url);
} catch (SeleniumException e) {
// try again
try {
html = getHtmlViaSelenium(url);
} catch (SeleniumException e2) {
return "";
}
}
} else {
return "";
}
return html;
}
}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.bmuse_webapp.utils;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
public class CompressorUtil {
public static String decompressValue(final String abstractCompressed) {
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
// try-with-resources so the gzip stream is always closed
try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray))) {
final StringWriter stringWriter = new StringWriter();
IOUtils.copy(gis, stringWriter);
return stringWriter.toString();
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
public static String compressValue(final String value) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(value.getBytes());
gzip.close();
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
}
}
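A small round-trip sketch of how these helpers fit together (compressValue produces the base64/gzipped string that ScrapeThread stores in CrawlRecord.nquads; the nquad line and class name below are illustrative):

import eu.dnetlib.bmuse_webapp.utils.CompressorUtil;

import java.io.IOException;

public class CompressorUtilDemo {
    public static void main(String[] args) throws IOException {
        String nquads = "<http://example.org/s> <http://example.org/p> \"o\" <http://example.org/g> .";
        String packed = CompressorUtil.compressValue(nquads);     // gzip, then base64-encode
        String unpacked = CompressorUtil.decompressValue(packed); // base64-decode, then gunzip
        System.out.println(nquads.equals(unpacked));              // prints true: the two methods are inverses
    }
}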

View File

@ -0,0 +1,64 @@
package eu.dnetlib.bmuse_webapp.utils;
import hwu.elixir.utils.Helpers;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
public class UrlParser {
private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());
public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
Document doc = new Document(url);
Document urlSitemapListsNested;
Elements elements = new Elements();
Elements sitemaps = new Elements();
boolean sitemapindex = false;
boolean urlset = false;
try {
int urlLength = url.length();
logger.info("parse sitemap list");
String sitemapExt = url.substring(urlLength - 3, urlLength);
if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
logger.info("compressed sitemap");
byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
doc = Helpers.gzipFileDecompression(bytes);
} else {
doc = Jsoup.connect(url).maxBodySize(0).get();
}
} catch (IOException e) {
logger.error("Jsoup parsing exception: " + e.getMessage());
}
try {
elements = doc.select(sitemapURLKey);
// check the html if it is a sitemapindex or a urlset
sitemapindex = doc.outerHtml().contains("sitemapindex");
urlset = doc.outerHtml().contains("urlset");
} catch (NullPointerException e) {
logger.error(e.getMessage());
}
if (sitemapindex) {
// if sitemapindex get the loc of all the sitemaps
// added warning for sitemap index files
logger.warn("please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
sitemaps = doc.select(sitemapURLKey);
}
return elements;
}
}

View File

@ -0,0 +1,24 @@
server.servlet.context-path=/dnet-bmuse-webapp
server.port=8281
spring.profiles.active=garr
logging.file.name = /var/log/springboot/9480/oa_organizations.log
maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
spring.main.banner-mode = off
logging.level.root = INFO
management.endpoints.web.exposure.include = prometheus,health
management.endpoints.web.base-path = /
management.endpoints.web.path-mapping.prometheus = metrics
management.endpoints.web.path-mapping.health = health
waitTime=5
outputFolder=/Users/enrico.ottonello/data/bmuse-output
numberOfPagesToCrawlInALoop=8
totalNumberOfPagesToCrawlInASession=32
chromiumDriverLocation = /usr/local/bin/chromedriver
scrapeVersion=1

View File

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<appender name="SAVE-TO-FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log</file>
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<Pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M - %msg%n</Pattern>
</encoder>
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp_%d{dd-MM-yyyy}_%i.log</fileNamePattern>
<maxFileSize>10MB</maxFileSize>
<maxHistory>10</maxHistory>
<totalSizeCap>100MB</totalSizeCap>
</rollingPolicy>
</appender>
<logger name="org.springframework" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
<logger name="root" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
<logger name="eu.dnetlib" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
<logger name="eu.dnetlib.bmuse_webapp" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
<logger name="hwu.elixir" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
</configuration>

View File

@ -18,6 +18,7 @@
<module>dnet-orgs-database-application</module>
<module>dnet-exporter-api</module>
<module>scholexplorer-api</module>
<module>bioschemas-api</module>
</modules>
<dependencies>

12
pom.xml
View File

@ -88,6 +88,18 @@
<name>Cloudera Repository</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
</repository>
<repository>
<id>dnet-deps</id>
<name>D-Net Dependencies</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
<layout>default</layout>
</repository>
</repositories>
<dependencies>