diff --git a/apps/bioschemas-api/deploy.info b/apps/bioschemas-api/deploy.info
new file mode 100644
index 00000000..015b818c
--- /dev/null
+++ b/apps/bioschemas-api/deploy.info
@@ -0,0 +1,10 @@
+{
+ "type_source": "SVN",
+ "goal": "package -U source:jar",
+ "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk/",
+ "deploy_repository": "dnet5-snapshots",
+ "version": "5",
+ "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it, enrico.ottonello@isti.cnr.it",
+ "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots",
+ "name": "dnet-ariadneplus-graphdb-publisher"
+}
\ No newline at end of file
diff --git a/apps/bioschemas-api/pom.xml b/apps/bioschemas-api/pom.xml
new file mode 100644
index 00000000..4bcb0408
--- /dev/null
+++ b/apps/bioschemas-api/pom.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+	<parent>
+		<groupId>eu.dnetlib.dhp</groupId>
+		<artifactId>apps</artifactId>
+		<version>3.2.8-SNAPSHOT</version>
+		<relativePath>../pom.xml</relativePath>
+	</parent>
+
+	<modelVersion>4.0.0</modelVersion>
+	<packaging>jar</packaging>
+	<artifactId>bioschemas-api</artifactId>
+
+	<dependencies>
+		<dependency>
+			<groupId>org.springframework.boot</groupId>
+			<artifactId>spring-boot-starter-test</artifactId>
+			<scope>test</scope>
+		</dependency>
+		<dependency>
+			<groupId>hwu.elixir</groupId>
+			<artifactId>bmuse-core</artifactId>
+			<version>0.5.4</version>
+		</dependency>
+		<dependency>
+			<groupId>org.freemarker</groupId>
+			<artifactId>freemarker</artifactId>
+			<version>2.3.27-incubating</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.any23</groupId>
+			<artifactId>apache-any23-core</artifactId>
+			<version>2.3</version>
+		</dependency>
+		<dependency>
+			<groupId>org.eclipse.rdf4j</groupId>
+			<artifactId>rdf4j-rio-rdfxml</artifactId>
+			<version>3.7.1</version>
+		</dependency>
+		<dependency>
+			<groupId>org.eclipse.rdf4j</groupId>
+			<artifactId>rdf4j-model</artifactId>
+			<version>3.7.1</version>
+		</dependency>
+		<dependency>
+			<groupId>org.jsoup</groupId>
+			<artifactId>jsoup</artifactId>
+			<version>1.13.1</version>
+		</dependency>
+		<dependency>
+			<groupId>org.seleniumhq.selenium</groupId>
+			<artifactId>selenium-java</artifactId>
+			<version>3.141.59</version>
+		</dependency>
+		<dependency>
+			<groupId>commons-io</groupId>
+			<artifactId>commons-io</artifactId>
+			<version>2.6</version>
+		</dependency>
+		<dependency>
+			<groupId>commons-validator</groupId>
+			<artifactId>commons-validator</artifactId>
+			<version>1.6</version>
+		</dependency>
+		<dependency>
+			<groupId>ch.qos.logback</groupId>
+			<artifactId>logback-classic</artifactId>
+			<version>1.2.3</version>
+		</dependency>
+	</dependencies>
+
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-help-plugin</artifactId>
+			</plugin>
+			<plugin>
+				<groupId>org.springframework.boot</groupId>
+				<artifactId>spring-boot-maven-plugin</artifactId>
+			</plugin>
+		</plugins>
+	</build>
+</project>
\ No newline at end of file
diff --git a/apps/bioschemas-api/pom.xml.original b/apps/bioschemas-api/pom.xml.original
new file mode 100644
index 00000000..f79644f5
--- /dev/null
+++ b/apps/bioschemas-api/pom.xml.original
@@ -0,0 +1,173 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+	<parent>
+		<groupId>org.springframework.boot</groupId>
+		<artifactId>spring-boot-starter-parent</artifactId>
+		<version>2.1.3.RELEASE</version>
+	</parent>
+
+	<modelVersion>4.0.0</modelVersion>
+	<groupId>eu.dnetlib</groupId>
+	<artifactId>dnet-bmuse-webapp</artifactId>
+	<packaging>jar</packaging>
+	<version>1.0.0-SNAPSHOT</version>
+	<scm>
+		<connection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk</connection>
+		<url>https://github.com/spring-projects/spring-boot/spring-boot-starter-parent/dnet-bmuse-webapp</url>
+	</scm>
+	<ciManagement>
+		<system>jenkins</system>
+		<url>https://jenkins-dnet.d4science.org/view/DNet50/job/DSL50_dnet-bmuse-webapp/</url>
+	</ciManagement>
+	<distributionManagement>
+		<repository>
+			<id>dnet5-releases</id>
+			<name>D-Net 5 Releases</name>
+			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
+			<layout>default</layout>
+		</repository>
+	</distributionManagement>
+
+	<repositories>
+		<repository>
+			<id>dnet-deps</id>
+			<name>D-Net Dependencies</name>
+			<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
+			<releases>
+				<enabled>true</enabled>
+			</releases>
+			<snapshots>
+				<enabled>false</enabled>
+			</snapshots>
+			<layout>default</layout>
+		</repository>
+		<repository>
+			<id>dnet5-releases</id>
+			<name>D-Net 5 Releases</name>
+			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
+			<layout>default</layout>
+			<snapshots>
+				<enabled>false</enabled>
+			</snapshots>
+		</repository>
+		<repository>
+			<id>dnet5-snapshots</id>
+			<name>D-Net 5 Snapshots</name>
+			<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots</url>
+			<layout>default</layout>
+			<snapshots>
+				<enabled>true</enabled>
+			</snapshots>
+		</repository>
+	</repositories>
+
+	<dependencies>
+		<dependency>
+			<groupId>junit</groupId>
+			<artifactId>junit</artifactId>
+			<version>4.13-rc-1</version>
+			<scope>test</scope>
+		</dependency>
+		<dependency>
+			<groupId>org.springframework.boot</groupId>
+			<artifactId>spring-boot-starter-test</artifactId>
+			<scope>test</scope>
+		</dependency>
+		<dependency>
+			<groupId>org.springframework.boot</groupId>
+			<artifactId>spring-boot-autoconfigure</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>org.springframework.boot</groupId>
+			<artifactId>spring-boot</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>org.springframework.boot</groupId>
+			<artifactId>spring-boot-starter-web</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>hwu.elixir</groupId>
+			<artifactId>bmuse-core</artifactId>
+			<version>0.5.4</version>
+		</dependency>
+		<dependency>
+			<groupId>org.freemarker</groupId>
+			<artifactId>freemarker</artifactId>
+			<version>2.3.27-incubating</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.any23</groupId>
+			<artifactId>apache-any23-core</artifactId>
+			<version>2.3</version>
+		</dependency>
+		<dependency>
+			<groupId>org.eclipse.rdf4j</groupId>
+			<artifactId>rdf4j-rio-rdfxml</artifactId>
+			<version>3.7.1</version>
+		</dependency>
+		<dependency>
+			<groupId>org.eclipse.rdf4j</groupId>
+			<artifactId>rdf4j-model</artifactId>
+			<version>3.7.1</version>
+		</dependency>
+		<dependency>
+			<groupId>org.jsoup</groupId>
+			<artifactId>jsoup</artifactId>
+			<version>1.13.1</version>
+		</dependency>
+		<dependency>
+			<groupId>org.seleniumhq.selenium</groupId>
+			<artifactId>selenium-java</artifactId>
+			<version>3.141.59</version>
+		</dependency>
+		<dependency>
+			<groupId>commons-io</groupId>
+			<artifactId>commons-io</artifactId>
+			<version>2.6</version>
+		</dependency>
+		<dependency>
+			<groupId>commons-validator</groupId>
+			<artifactId>commons-validator</artifactId>
+			<version>1.6</version>
+		</dependency>
+		<dependency>
+			<groupId>ch.qos.logback</groupId>
+			<artifactId>logback-classic</artifactId>
+			<version>1.2.3</version>
+		</dependency>
+	</dependencies>
+
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.springframework.boot</groupId>
+				<artifactId>spring-boot-maven-plugin</artifactId>
+				<configuration>
+					<executable>true</executable>
+				</configuration>
+			</plugin>
+		</plugins>
+	</build>
+
+	<properties>
+		<java.version>1.8</java.version>
+		<maven.javadoc.skip>false</maven.javadoc.skip>
+	</properties>
+
+	<profiles>
+		<profile>
+			<id>java8-doclint-disabled</id>
+			<activation>
+				<jdk>[1.8,)</jdk>
+			</activation>
+			<properties>
+				<javadoc.opts>-Xdoclint:none</javadoc.opts>
+			</properties>
+		</profile>
+	</profiles>
+</project>
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java
new file mode 100644
index 00000000..634f9172
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java
@@ -0,0 +1,45 @@
+package eu.dnetlib.bmuse_webapp;
+
+import org.springframework.boot.web.client.RestTemplateBuilder;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.context.annotation.Profile;
+import org.springframework.web.client.RestTemplate;
+
+import freemarker.cache.ClassTemplateLoader;
+import freemarker.template.TemplateExceptionHandler;
+
+/**
+ * @author enrico.ottonello
+ *
+ */
+@Profile("garr")
+@Configuration
+public class AppConfigGarr {
+
+ @Bean
+ public RestTemplate jrrRestTemplate(){
+ //TODO: move configuration here from CatalogueRegistrator?
+ return new RestTemplateBuilder().build();
+ }
+
+
+ @Bean
+ public freemarker.template.Configuration freemarkerConfig(){
+ freemarker.template.Configuration config = new freemarker.template.Configuration(freemarker.template.Configuration.VERSION_2_3_27);
+ ClassTemplateLoader ctl = new ClassTemplateLoader(getClass(), "/eu/dnetlib/bmuse_webapp/sparql");
+ config.setTemplateLoader(ctl);
+ config.setDefaultEncoding("UTF-8");
+ // Sets how errors will appear.
+ // During web page *development* TemplateExceptionHandler.HTML_DEBUG_HANDLER is better.
+ config.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
+
+ // Don't log exceptions inside FreeMarker that it will thrown at you anyway:
+ config.setLogTemplateExceptions(false);
+
+ // Wrap unchecked exceptions thrown during template processing into TemplateException-s.
+ config.setWrapUncheckedExceptions(true);
+
+ return config;
+ }
+}
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/MainApplication.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/MainApplication.java
new file mode 100644
index 00000000..44bbd5ad
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/MainApplication.java
@@ -0,0 +1,42 @@
+package eu.dnetlib.bmuse_webapp;
+
+import eu.dnetlib.common.app.AbstractDnetApp;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.cache.annotation.EnableCaching;
+import org.springframework.context.annotation.ComponentScan;
+import org.springframework.scheduling.annotation.EnableScheduling;
+import springfox.documentation.builders.ApiInfoBuilder;
+import springfox.documentation.builders.RequestHandlerSelectors;
+import springfox.documentation.service.ApiInfo;
+import springfox.documentation.spring.web.plugins.Docket;
+import springfox.documentation.swagger2.annotations.EnableSwagger2;
+
+@SpringBootApplication
+@EnableSwagger2
+@EnableCaching
+@EnableScheduling
+@ComponentScan(basePackages = "eu.dnetlib")
+public class MainApplication extends AbstractDnetApp {
+
+ public static void main(final String[] args) {
+ SpringApplication.run(MainApplication.class, args);
+ }
+
+ @Override
+ protected void configSwagger(final Docket docket) {
+ docket.select()
+ .apis(RequestHandlerSelectors.any())
+ .paths(p -> p.contains("/api/"))
+ .build()
+ .apiInfo(new ApiInfoBuilder()
+ .title("D-Net Bioschemas Service APIs")
+ .description("APIs documentation")
+ .version("1.1")
+ .contact(ApiInfo.DEFAULT_CONTACT)
+ .license("Apache 2.0")
+ .licenseUrl("http://www.apache.org/licenses/LICENSE-2.0")
+ .build());
+ }
+
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java
new file mode 100644
index 00000000..3b1ab451
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java
@@ -0,0 +1,261 @@
+package eu.dnetlib.bmuse_webapp;
+
+import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
+import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
+import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
+import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;
+import eu.dnetlib.bmuse_webapp.scraper.ServiceScraper;
+import eu.dnetlib.bmuse_webapp.utils.UrlParser;
+import hwu.elixir.utils.Helpers;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+
+/**
+ * Runs the scrape. Collects a list of URLs (as CrawlRecords) to scrape,
+ * scrapes them in turn, writes the (bio)schemas markup extracted to a file
+ * (one file per URL) and adds provenance to each CrawlRecord.
+ *
+ */
+public class ServiceScrapeDriver {
+
+ private static final String propertiesFile = "application.properties";
+
+ private int waitTime = 1;
+ private int numberOfPagesToCrawlInALoop;
+ private int totalNumberOfPagesToCrawlInASession;
+ private String outputFolder;
+ private int pagesCounter = 0;
+ private int scrapeVersion = 1;
+
+ private String sitemapUrl;
+ private String sitemapURLKey;
+ private String maxScrapedPages;
+ private String outputFilename;
+
+ private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
+
+ private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);
+
+ public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
+ this.sitemapUrl = sitemapUrl;
+ this.sitemapURLKey = sitemapURLKey;
+ this.maxScrapedPages = maxScrapedPages;
+ this.outputFilename = outputFilename;
+ }
+
+ /**
+ * Runs the scrape process
+ *
+ */
+ public void start() throws IOException {
+ runScrape();
+ }
+
+ /**
+ * Fires off the scraping threads.
+ * Originally designed as a multi-threaded process; now reduced to a single thread as
+ * the selenium webdriver is too expensive to run multi-threaded. However, the threading
+ * has been left in situ in case it is useful in the future.
+ *
+ */
+ private void runScrape() throws IOException {
+ processProperties();
+ Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());
+ Stream<Element> urlStream = null;
+ if (Objects.nonNull(maxScrapedPages)) {
+ urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
+ } else {
+ urlStream = urls.stream();
+ }
+ List<Element> sites = urlStream.collect(Collectors.toList());
+ logger.info("Pages available for scraping: " + sites.size());
+
+ List<CrawlRecord> pagesToPull = generatePagesToPull(sites);
+ if (pagesToPull.isEmpty()) {
+ logger.error("Cannot retrieve URLs");
+ throw new RuntimeException("No pages found from sitemap");
+ }
+
+ ScrapeState scrapeState = new ScrapeState(pagesToPull);
+
+ logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
+ while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
+ logger.info(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
+
+ ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
+ scrape1.setName("S1");
+
+// ScrapeThread scrape2 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
+// scrape2.setName("S2");
+//
+// ScrapeThread scrape3 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
+// scrape3.setName("S3");
+//
+// ScrapeThread scrape4 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
+// scrape4.setName("S4");
+
+ scrape1.start();
+// scrape2.start();
+// scrape3.start();
+// scrape4.start();
+ long startTime = System.nanoTime();
+
+ try {
+ scrape1.join();
+// scrape2.join();
+// scrape3.join();
+// scrape4.join();
+ } catch (InterruptedException e) {
+ logger.error("Exception waiting on thread");
+ e.printStackTrace();
+ return;
+ }
+
+ if(!scrape1.isFileWritten()) {
+ logger.error("Could not write output file so shutting down!");
+ Date date = new Date(System.currentTimeMillis());
+ logger.info("ENDING CRAWL after failure at: " + formatter.format(date));
+ return;
+ }
+
+ logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
+
+ long endTime = System.nanoTime();
+ long timeElapsed = endTime - startTime;
+ logger.info("Time in s to complete: " + timeElapsed / 1e+9);
+
+ updateDatabase(scrapeState);
+ pagesCounter += numberOfPagesToCrawlInALoop;
+
+
+ logger.info("ENDED loop");
+ }
+
+// Map nquads = scrapeState.getNquadsConcurrentHashMap();
+// logger.info("Available nquads records: "+nquads.size() );
+
+ logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
+
+ File output = new File(outputFolder.concat("/").concat(outputFilename));
+ if (output.exists()) {
+ output.delete();
+ output.createNewFile();
+ }
+ FileWriter fileWriter;
+ BufferedWriter bufferedWriter;
+ fileWriter = new FileWriter(output.getAbsoluteFile(), true); // true to append
+ bufferedWriter = new BufferedWriter(fileWriter);
+
+ List processed = scrapeState.getPagesProcessed();
+ for (int i=0;i generatePagesToPull(List sites) {
+ List crawls = sites
+ .stream()
+ .map(s -> {
+ CrawlRecord crawlRecord = new CrawlRecord(s.text());
+ String[] urlSplitted = crawlRecord.getUrl().split("/");
+ String name = urlSplitted[urlSplitted.length - 1];
+ crawlRecord.setName(name);
+ return crawlRecord;
+ })
+ .collect(Collectors.toList());
+ return crawls;
+ }
+
+ /**
+ * Updates properties based on properties file in src > main > resources
+ *
+ */
+ private void processProperties() {
+ ClassLoader classLoader = ServiceScrapeDriver.class.getClassLoader();
+
+ InputStream is = classLoader.getResourceAsStream(propertiesFile);
+ if(is == null) {
+ logger.error(" Cannot find " + propertiesFile + " file");
+ throw new IllegalArgumentException(propertiesFile + "file is not found!");
+ }
+
+ Properties prop = new Properties();
+
+ try {
+ prop.load(is);
+ } catch (IOException e) {
+ logger.error(" Cannot load application.properties", e);
+ System.exit(0);
+ }
+
+ waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
+ logger.info(" waitTime: " + waitTime);
+ outputFolder = prop.getProperty("outputFolder").trim();
+ logger.info(" outputFolder: " + outputFolder);
+ numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
+ logger.info(" numberOfPagesToCrawl: " + numberOfPagesToCrawlInALoop);
+ totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
+ logger.info(" totalNumberOfPagesToCrawlInASession: " + totalNumberOfPagesToCrawlInASession);
+ scrapeVersion = Integer.parseInt(prop.getProperty("scrapeVersion").trim());
+ logger.info(" scrapeVersion: " + scrapeVersion);
+ logger.info("\n\n\n");
+ }
+
+ public String getSitemapUrl() {
+ return sitemapUrl;
+ }
+
+ public String getSitemapURLKey() {
+ return sitemapURLKey;
+ }
+
+ private String getId(String pageUrl) {
+ String[] parts = pageUrl.split("/");
+ return parts[parts.length - 1];
+ }
+
+
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/CrawlRecord.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/CrawlRecord.java
new file mode 100644
index 00000000..e711d745
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/CrawlRecord.java
@@ -0,0 +1,136 @@
+package eu.dnetlib.bmuse_webapp.crawl;
+
+import java.util.Date;
+
+import hwu.elixir.utils.Validation;
+
+
+/**
+ * Stores the current status of a single URL in the scrape service.
+ *
+ */
+
+
+public class CrawlRecord {
+
+ private Long id;
+
+ private String context = "";
+
+ private String url;
+
+ private Date dateScraped;
+
+ private StatusOfScrape status;
+
+ private boolean beingScraped;
+
+ private String name;
+
+ private String nquads;
+
+ public CrawlRecord() {
+ status = StatusOfScrape.UNTRIED;
+ }
+
+ public CrawlRecord(String url) {
+ Validation validation = new Validation();
+ if(validation.validateURI(url)) {
+ this.url = url;
+ context = "";
+ status = StatusOfScrape.UNTRIED;
+ dateScraped = null;
+ } else {
+ throw new IllegalArgumentException(url +" is not a valid url");
+ }
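+ // no id generator here, so use the creation timestamp as a surrogate id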
+ this.setId(System.currentTimeMillis());
+ }
+
+ public Long getId() {
+ return id;
+ }
+
+ public void setId(Long id) {
+ this.id = id;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public Date getDateScraped() {
+ return dateScraped;
+ }
+
+ public void setDateScraped(Date dateScraped) {
+ this.dateScraped = dateScraped;
+ }
+
+ public StatusOfScrape getStatus() {
+ return status;
+ }
+
+ public void setStatus(StatusOfScrape status) {
+ this.status = status;
+ }
+
+ public String getContext() {
+ return context;
+ }
+
+ public void setContext(String context) {
+ this.context = context;
+ }
+
+ public boolean isBeingScraped() {
+ return beingScraped;
+ }
+
+ public void setBeingScraped(boolean beingScraped) {
+ this.beingScraped = beingScraped;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ public String getNquads() {
+ return nquads;
+ }
+
+ public void setNquads(String nquads) {
+ this.nquads = nquads;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o)
+ return true;
+ if (!(o instanceof CrawlRecord))
+ return false;
+
+ CrawlRecord otherCrawl = (CrawlRecord) o;
+
+ if(this.url.equals(otherCrawl.getUrl())) {
+ return true;
+ }
+
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+ // equals() treats records with the same URL as equal, so hash on the URL alone
+ return getUrl() != null ? getUrl().hashCode() : 0;
+ }
+
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/StatusOfScrape.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/StatusOfScrape.java
new file mode 100644
index 00000000..9ecd7ba1
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/StatusOfScrape.java
@@ -0,0 +1,19 @@
+package eu.dnetlib.bmuse_webapp.crawl;
+
+/**
+ *
+ * {@link eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape} describes the possible status levels of the scrape for each URL/CrawlRecord.
+ *
+ * Each URL/CrawlRecord can have one of the following:
+ * DOES_NOT_EXIST = 404.
+ * HUMAN_INSPECTION = cannot parse for some reason; a human should see what is happening.
+ * UNTRIED = not scraped yet.
+ * FAILED = one failed attempt at scraping; will try again.
+ * GIVEN_UP = two failed attempts at scraping. Will not try again.
+ * SUCCESS = successfully scraped.
+ *
+ */
+
+public enum StatusOfScrape {
+ DOES_NOT_EXIST, HUMAN_INSPECTION, UNTRIED, FAILED, GIVEN_UP, SUCCESS;
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java
new file mode 100644
index 00000000..22beeb97
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java
@@ -0,0 +1,65 @@
+package eu.dnetlib.bmuse_webapp.publisher;
+
+import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
+import eu.dnetlib.common.controller.AbstractDnetController;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.LineIterator;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tomcat.jni.FileInfo;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.web.bind.annotation.*;
+
+import javax.servlet.http.HttpServletResponse;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+/**
+ * @author enrico.ottonello
+ *
+ */
+
+@RestController
+@RequestMapping("/api")
+public class BMUSEWebappController extends AbstractDnetController {
+
+ private static final Log log = LogFactory.getLog(BMUSEWebappController.class);
+
+ @RequestMapping(value = "/version", method = RequestMethod.GET)
+ public String version() throws BMUSEWebappException {
+ return "1.0.0-SNAPSHOT";
+ }
+
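+ /**
+ * Starts a scrape of the given sitemap; each scraped page is stored as one
+ * gzipped, base64-encoded nquads record in
+ * {outputFolder}/{datasourceKey}_base64_gzipped_nquads.txt (see application.properties).
+ */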
+ @RequestMapping(value = "/scrape", method = RequestMethod.GET)
+ public String scrape(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
+
+ log.info("datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
+// String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz"; scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
+// String sitemapUrl = "https://proteinensemble.org/sitemap2.xml.gz"; scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
+// String sitemapUrl = "https://disprot.org/sitemap2.xml.gz"; scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
+ String sitemapUrlKey = "loc";
+ String outputFilename = datasourceKey.concat("_base64_gzipped_nquads.txt");
+ ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
+ service.start();
+ return "started";
+ }
+
+ @RequestMapping(value = "/nquads", method = RequestMethod.GET)
+ public String nquads(HttpServletResponse response) throws BMUSEWebappException, IOException {
+ LineIterator it = FileUtils.lineIterator(new File("/Users/enrico.ottonello/data/bmuse-output/output.nq"), "UTF-8");
+ try {
+ while (it.hasNext()) {
+ String line = it.nextLine();
+ response.getOutputStream().write(line.getBytes(StandardCharsets.UTF_8));
+ response.getOutputStream().println();
+ }
+ } finally {
+ it.close();
+ }
+ return "";
+ }
+}
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappException.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappException.java
new file mode 100644
index 00000000..9687ebfa
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappException.java
@@ -0,0 +1,28 @@
+package eu.dnetlib.bmuse_webapp.publisher;
+
+/**
+ * @author enrico.ottonello
+ *
+ */
+
+public class BMUSEWebappException extends Exception{
+
+ public BMUSEWebappException() {
+ }
+
+ public BMUSEWebappException(final String message) {
+ super(message);
+ }
+
+ public BMUSEWebappException(final String message, final Throwable cause) {
+ super(message, cause);
+ }
+
+ public BMUSEWebappException(final Throwable cause) {
+ super(cause);
+ }
+
+ public BMUSEWebappException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) {
+ super(message, cause, enableSuppression, writableStackTrace);
+ }
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/HomeController.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/HomeController.java
new file mode 100644
index 00000000..ab7143f2
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/HomeController.java
@@ -0,0 +1,17 @@
+package eu.dnetlib.bmuse_webapp.publisher;
+
+import eu.dnetlib.common.controller.AbstractDnetController;
+import org.springframework.stereotype.Controller;
+import org.springframework.web.bind.annotation.GetMapping;
+
+@Controller
+public class HomeController extends AbstractDnetController {
+
+ @GetMapping({
+ "/doc", "/swagger"
+ })
+ public String apiDoc() {
+ return "redirect:swagger-ui/";
+ }
+
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java
new file mode 100644
index 00000000..4203f18a
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java
@@ -0,0 +1,90 @@
+
+package eu.dnetlib.bmuse_webapp.scraper;
+
+import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
+import hwu.elixir.scrape.exceptions.MissingMarkupException;
+import hwu.elixir.scrape.scraper.ScraperFilteredCore;
+import org.apache.any23.Any23;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.source.DocumentSource;
+import org.apache.any23.source.StringDocumentSource;
+import org.apache.any23.writer.NTriplesWriter;
+import org.apache.any23.writer.TripleHandler;
+import org.apache.any23.writer.TripleHandlerException;
+import org.apache.commons.io.output.ByteArrayOutputStream;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Model;
+import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
+import org.eclipse.rdf4j.rio.RDFFormat;
+import org.eclipse.rdf4j.rio.Rio;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.StringWriter;
+
+public class BMUSEScraper extends ScraperFilteredCore {
+
+ private static final Log logger = LogFactory.getLog(BMUSEScraper.class);
+
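+ /**
+ * Scrapes a single URL and returns the extracted markup serialised as nquads.
+ * The HTML is fetched (via selenium when dynamic is true, jsoup otherwise),
+ * triples are extracted with Any23 and post-processed into an RDF model that
+ * is finally written out in NQUADS format.
+ */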
+ public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
+ logger.debug(url + " > scraping");
+ url = fixURL(url);
+
+ String html = "";
+ // The dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information
+ // (dynamic and static respectively)
+
+ if (dynamic) {
+ html = wrapHTMLExtraction(url);
+ } else {
+ html = wrapHTMLExtractionStatic(url);
+ }
+
+ if (html == null || html.contentEquals(""))
+ throw new Exception("empty html");
+
+ html = injectId(html, url);
+
+ logger.debug(url + " > html scraped from " + url);
+ DocumentSource source = new StringDocumentSource(html, url);
+ String n3 = html2Triples(source, url);
+ if (n3 == null) {
+ throw new MissingMarkupException(url);
+ }
+
+ logger.debug(url + " > processing triples");
+ IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
+ Model updatedModel = processTriples(n3, sourceIRI, 0L);
+ if (updatedModel == null) {
+ throw new Exception("rdf model null");
+ }
+
+ logger.debug(url + " > generating nquads");
+ try (StringWriter nquadsWriter = new StringWriter()) {
+ Rio.write(updatedModel, nquadsWriter, RDFFormat.NQUADS);
+ logger.debug(url + " > nquads generated");
+ return nquadsWriter.toString();
+ }
+ }
+
+ private String html2Triples(DocumentSource source, String url) throws Exception {
+ Any23 runner = new Any23();
+ try (ByteArrayOutputStream out = new ByteArrayOutputStream();
+ TripleHandler handler = new NTriplesWriter(out);) {
+ runner.extract(source, handler);
+ return out.toString("UTF-8");
+ } catch (ExtractionException e) {
+ logger.error("Cannot extract triples", e);
+ } catch (IOException e1) {
+ logger.error(" IO error whilst extracting triples", e1);
+ } catch (TripleHandlerException e2) {
+ logger.error("TripleHanderException", e2);
+ }
+ return null;
+ }
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeState.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeState.java
new file mode 100644
index 00000000..11aababe
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeState.java
@@ -0,0 +1,157 @@
+package eu.dnetlib.bmuse_webapp.scraper;
+
+import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
+import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Maintains the shared state of a scrape cycle across threads: the queue of
+ * CrawlRecords still to scrape and the records already processed.
+ */
+public class ScrapeState {
+
+ private List<CrawlRecord> urlsToScrape = Collections.synchronizedList(new ArrayList<CrawlRecord>());
+ private List<CrawlRecord> urlsProcessed = Collections.synchronizedList(new ArrayList<CrawlRecord>()); // should this be a set?
+ private Map<String, String> nquadsConcurrentHashMap = new ConcurrentHashMap<>();
+
+ /**
+ *
+ * @param pagesToBeScraped The list of sites to be scraped
+ * @see ScrapeThread
+ * @see CrawlRecord
+ */
+ public ScrapeState(List<CrawlRecord> pagesToBeScraped) {
+ urlsToScrape.addAll(pagesToBeScraped);
+ }
+
+ /**
+ * Any pages/URLs left to scrape?
+ * @return True for yes & false for no
+ * @see CrawlRecord
+ */
+ public synchronized boolean pagesLeftToScrape() {
+ return !urlsToScrape.isEmpty();
+ }
+
+ /**
+ * Returns the next URL/CrawlRecord to be scraped
+ *
+ * @return First page/URL that needs to be scraped next
+ * @see CrawlRecord
+ */
+ public synchronized CrawlRecord getURLToProcess() {
+ if (urlsToScrape.isEmpty())
+ return null;
+
+ return urlsToScrape.remove(0);
+ }
+
+ /**
+ * Adds the given CrawlRecord to the list of CrawlRecords successfully scraped.
+ * Updates the status of the CrawlRecord to SUCCESS.
+ *
+ * @param record The latest URL/page that has been successfully scraped
+ * @see CrawlRecord
+ */
+ public synchronized void addSuccessfulScrapedURL(CrawlRecord record) {
+ record.setStatus(StatusOfScrape.SUCCESS);
+ urlsProcessed.add(record);
+ }
+
+ /**
+ * Adds the given CrawlRecord to the list of CrawlRecords NOT successfully scraped.
+ * Updates the status of the CrawlRecord; if first failure the status is FAILED.
+ * If status is already FAILED it is changed to GIVEN_UP.
+ *
+ * If the status is FAILED, another try will be made in a future run.
+ *
+ *
+ * @param record The latest URL/page that has been unsuccessfully scraped
+ * @see CrawlRecord
+ */
+ public synchronized void addFailedToScrapeURL(CrawlRecord record) {
+ if (record.getStatus().equals(StatusOfScrape.FAILED)) {
+ record.setStatus(StatusOfScrape.GIVEN_UP);
+ } else {
+ record.setStatus(StatusOfScrape.FAILED);
+ }
+ urlsProcessed.add(record);
+ }
+
+ /**
+ * Changes the status of the CrawlRecord to DOES_NOT_EXIST.
+ * As Selenium does not return the HTTP codes, it is questionable
+ * how useful this is.
+ *
+ *
+ * @param record The latest URL/page that has been 404'd
+ * @see CrawlRecord
+ */
+ public synchronized void setStatusTo404(CrawlRecord record) {
+ record.setStatus(StatusOfScrape.DOES_NOT_EXIST);
+ urlsProcessed.add(record);
+ }
+
+
+ /**
+ *
+ * Changes the status of the CrawlRecord to HUMAN_INSPECTION.
+ * This captures the idea that the URLs may contain unexpected markup that needs a human to
+ * review and possibly update the scraper.
+ *
+ * @param record The latest URL/page that needs human inspection
+ * @see CrawlRecord
+ */
+ public synchronized void setStatusToHumanInspection(CrawlRecord record) {
+ record.setStatus(StatusOfScrape.HUMAN_INSPECTION);
+ urlsProcessed.add(record);
+ }
+
+
+ /**
+ * Returns the number of URLs that are still to be scraped in this cycle.
+ * This does not return the number of URLs left to scrape in the DBMS, just in the current cycle.
+ *
+ * @return Number of URLs left to scrape in this cycle
+ * @see CrawlRecord
+ */
+ public synchronized int getNumberPagesLeftToScrape() {
+ return urlsToScrape.size();
+ }
+
+ /**
+ * Gets the full list of URLs that have been processed in this cycle.
+ * This does not return the number of URLs that have been scraped in total across all cycles.
+ *
+ * @return List of CrawlRecords processed in this cycle
+ * @see CrawlRecord
+ */
+ public synchronized List<CrawlRecord> getPagesProcessed() {
+ return urlsProcessed;
+ }
+
+ /**
+ * Gets the full list of URLs/CrawlRecords regardless of whether scraped or not in the current cycle.
+ *
+ * @return List of all CrawlRecords in this cycle.
+ * @see CrawlRecord
+ */
+ public synchronized List<CrawlRecord> getPagesProcessedAndUnprocessed() {
+ List<CrawlRecord> urlsCombined = Collections.synchronizedList(new ArrayList<CrawlRecord>());
+ urlsCombined.addAll(urlsProcessed);
+ urlsCombined.addAll(urlsToScrape);
+ return urlsCombined;
+ }
+
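+ /**
+ * Stores the nquads scraped for a page; putIfAbsent keeps the first value
+ * if two threads ever offer the same key.
+ */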
+ public void addNquads(String key, String nquads) {
+ nquadsConcurrentHashMap.putIfAbsent(key, nquads);
+ }
+
+ public Map getNquadsConcurrentHashMap() {
+ return nquadsConcurrentHashMap;
+ }
+}
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java
new file mode 100644
index 00000000..7fe1ef88
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java
@@ -0,0 +1,109 @@
+package eu.dnetlib.bmuse_webapp.scraper;
+
+import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
+import eu.dnetlib.bmuse_webapp.utils.CompressorUtil;
+import hwu.elixir.scrape.exceptions.CannotWriteException;
+import hwu.elixir.scrape.exceptions.FourZeroFourException;
+import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
+import hwu.elixir.scrape.exceptions.MissingMarkupException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.Date;
+
+/**
+ *
+ * @see BMUSEScraper
+ * @see ScrapeState
+ *
+ */
+public class ScrapeThread extends Thread {
+ private ScrapeState scrapeState;
+ private BMUSEScraper process;
+ private int waitTime;
+ private boolean fileWritten = true;
+ private int scrapeVersion = 1;
+
+ private static final Log logger = LogFactory.getLog(ScrapeThread.class);
+
+ /**
+ * Sets up a thread for actually scraping.
+ *
+ * @param scraper Scraper that will actually do the scraping.
+ * @param scrapeState Object that maintains state across threads.
+ * @param waitTime How long (in seconds) thread should wait after scraping
+ * page before attempting new page.
+ * @param contextVersion The context URL used is 'https://bioschemas.org/crawl/CONTEXTVERSION/ID', where ID is the id of the CrawlRecord pulled.
+ *
+ */
+ public ScrapeThread(BMUSEScraper scraper, ScrapeState scrapeState, int waitTime, int contextVersion) {
+ this.scrapeState = scrapeState;
+ process = scraper;
+ this.waitTime = waitTime;
+ this.scrapeVersion = contextVersion;
+ }
+
+ /**
+ * Defines the high-level process of scraping. The actual scraping is done by an
+ * implementation of Scraper. If a page scrape is successful, its CrawlRecord is
+ * added to the ScrapeState's processed list.
+ *
+ * @see BMUSEScraper
+ * @see ScrapeState
+ */
+ @Override
+ public void run() {
+ while (scrapeState.pagesLeftToScrape()) {
+ CrawlRecord record = scrapeState.getURLToProcess();
+
+ if (record == null)
+ break;
+
+ record.setContext("https://bioschemas.org/crawl/" + scrapeVersion +"/" + record.getId());
+ record.setDateScraped(new Date());
+
+ try {
+ String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
+// scrapeState.addNquads(record.getName(), nquads);
+ logger.info("downloaded "+record.getUrl());
+ record.setNquads(CompressorUtil.compressValue(nquads));
+ if (!nquads.isEmpty()) {
+ scrapeState.addSuccessfulScrapedURL(record);
+ } else {
+ scrapeState.addFailedToScrapeURL(record);
+ }
+ } catch(FourZeroFourException fourZeroFourException) {
+ scrapeState.setStatusTo404(record);
+ fileWritten = false;
+ } catch (JsonLDInspectionException je) {
+ scrapeState.setStatusToHumanInspection(record);
+ fileWritten = false;
+ } catch (CannotWriteException cannotWrite) {
+ logger.error("Caught cannot read file, setting worked to false!");
+ fileWritten = false;
+ scrapeState.addFailedToScrapeURL(record);
+ return; // no point in continuing
+ } catch (MissingMarkupException e) {
+ logger.error("Cannot obtain markup from " + record.getUrl() +".");
+ fileWritten = false;
+ scrapeState.addFailedToScrapeURL(record);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ try {
+ ScrapeThread.sleep(100 * waitTime);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ process.shutdown();
+ }
+
+ public boolean isFileWritten() {
+ return fileWritten;
+ }
+}
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java
new file mode 100644
index 00000000..b13e9cb6
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java
@@ -0,0 +1,72 @@
+package eu.dnetlib.bmuse_webapp.scraper;
+
+import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
+import hwu.elixir.scrape.exceptions.*;
+import hwu.elixir.scrape.scraper.ScraperFilteredCore;
+import org.apache.commons.lang.time.DateUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Provides the actual scraping functionality.
+ *
+ * Scrapes a given URL, converts into NQuads and writes to a file (name derived
+ * from URL). If the file already exists it will be overwritten.
+ *
+ *
+ * @see ScraperFilteredCore
+ *
+ */
+public class ServiceScraper extends ScraperFilteredCore {
+
+ private static final Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
+
+ private StatusOfScrape status = null;
+
+
+ /**
+ * Orchestrates the process of scraping a site before converting the extracted
+ * triples to NQuads and writing to a file.
+ *
+ * @param url Site to be scraped
+ * @param contextCounter Number used to generate the named graph/context and
+ * the URLs used to replace blank nodes.
+ * @param outputFolderName Location to which the NQuads will be written
+ * @return True if success; false otherwise
+ * @throws FourZeroFourException
+ * @throws JsonLDInspectionException
+ * @throws CannotWriteException
+ * @throws MissingMarkupException
+ *
+ */
+ public boolean scrape(String url, Long contextCounter, String outputFolderName, String fileName, StatusOfScrape status) throws FourZeroFourException, JsonLDInspectionException, CannotWriteException, MissingMarkupException {
+ this.status = status;
+ logger.info("scraping "+url + " to "+fileName);
+ return scrape(url, outputFolderName, fileName, contextCounter, true);
+ }
+
+
+
+ /* Now takes account of the StatusOfScrape */
+ @Override
+ protected String wrapHTMLExtraction(String url) throws FourZeroFourException {
+ String html = "";
+ if (status.equals(StatusOfScrape.UNTRIED) || status.equals(StatusOfScrape.FAILED)) {
+ try {
+ html = getHtmlViaSelenium(url);
+ } catch (SeleniumException e) {
+ // try again
+ try {
+ html = getHtmlViaSelenium(url);
+ } catch (SeleniumException e2) {
+ return "";
+ }
+ }
+ } else {
+ return "";
+ }
+ return html;
+ }
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/CompressorUtil.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/CompressorUtil.java
new file mode 100644
index 00000000..cab5102a
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/CompressorUtil.java
@@ -0,0 +1,34 @@
+package eu.dnetlib.bmuse_webapp.utils;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.io.IOUtils;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
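+/**
+ * Helpers to gzip-compress a string and Base64-encode it (and back), used to
+ * store scraped nquads compactly on a CrawlRecord:
+ * decompressValue(compressValue(nquads)) returns the original string.
+ */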
+public class CompressorUtil {
+
+ public static String decompressValue(final String abstractCompressed) {
+ try {
+ byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
+ GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
+ final StringWriter stringWriter = new StringWriter();
+ IOUtils.copy(gis, stringWriter);
+ return stringWriter.toString();
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+
+ public static String compressValue(final String value) throws IOException {
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ try (GZIPOutputStream gzip = new GZIPOutputStream(out)) {
+ gzip.write(value.getBytes());
+ }
+ return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
+ }
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/UrlParser.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/UrlParser.java
new file mode 100644
index 00000000..39c64791
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/UrlParser.java
@@ -0,0 +1,64 @@
+
+package eu.dnetlib.bmuse_webapp.utils;
+
+import hwu.elixir.utils.Helpers;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+public class UrlParser {
+
+ private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());
+
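+ /**
+ * Downloads a sitemap (gzipped or plain XML) and returns the elements matching
+ * sitemapURLKey, e.g. "loc" for the URL entries of a urlset sitemap.
+ * Sitemap index files are not supported and only trigger a warning.
+ */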
+ public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
+
+ Document doc = new Document(url);
+ Document urlSitemapListsNested;
+ Elements elements = new Elements();
+ Elements sitemaps = new Elements();
+ boolean sitemapindex = false;
+ boolean urlset = false;
+
+ try {
+ int urlLength = url.length();
+ logger.info("parse sitemap list");
+ String sitemapExt = url.substring(urlLength - 3, urlLength);
+ if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
+ logger.info("compressed sitemap");
+ byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
+ doc = Helpers.gzipFileDecompression(bytes);
+ } else {
+ doc = Jsoup.connect(url).maxBodySize(0).get();
+ }
+
+ } catch (IOException e) {
+ logger.error("Jsoup parsing exception: " + e.getMessage());
+ }
+
+ try {
+
+ elements = doc.select(sitemapURLKey);
+
+ // check the html if it is a sitemapindex or a urlset
+ sitemapindex = doc.outerHtml().contains("sitemapindex");
+ urlset = doc.outerHtml().contains("urlset");
+ } catch (NullPointerException e) {
+ logger.error(e.getMessage());
+ }
+
+ if (sitemapindex) {
+ // if sitemapindex get the loc of all the sitemaps
+ // added warning for sitemap index files
+ logger
+ .warn(
+ "please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
+ sitemaps = doc.select(sitemapURLKey);
+ }
+
+ return elements;
+ }
+}
diff --git a/apps/bioschemas-api/src/main/resources/application.properties b/apps/bioschemas-api/src/main/resources/application.properties
new file mode 100644
index 00000000..ce5f349b
--- /dev/null
+++ b/apps/bioschemas-api/src/main/resources/application.properties
@@ -0,0 +1,24 @@
+server.servlet.context-path=/dnet-bmuse-webapp
+server.port=8281
+
+spring.profiles.active=garr
+
+logging.file.name = /var/log/springboot/9480/oa_organizations.log
+
+maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
+
+spring.main.banner-mode = off
+
+logging.level.root = INFO
+
+management.endpoints.web.exposure.include = prometheus,health
+management.endpoints.web.base-path = /
+management.endpoints.web.path-mapping.prometheus = metrics
+management.endpoints.web.path-mapping.health = health
+
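+# BMUSE scraper settings: waitTime is the pause applied between page scrapes
+# (see ScrapeThread); a session stops after totalNumberOfPagesToCrawlInASession
+# pages, processed in batches of numberOfPagesToCrawlInALoop.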
+waitTime=5
+outputFolder=/Users/enrico.ottonello/data/bmuse-output
+numberOfPagesToCrawlInALoop=8
+totalNumberOfPagesToCrawlInASession=32
+chromiumDriverLocation = /usr/local/bin/chromedriver
+scrapeVersion=1
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/resources/logback-spring.xml b/apps/bioschemas-api/src/main/resources/logback-spring.xml
new file mode 100644
index 00000000..3c5e86fe
--- /dev/null
+++ b/apps/bioschemas-api/src/main/resources/logback-spring.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+	<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
+		<file>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log</file>
+		<encoder>
+			<pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M - %msg%n</pattern>
+		</encoder>
+		<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
+			<fileNamePattern>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp_%d{dd-MM-yyyy}_%i.log</fileNamePattern>
+			<maxFileSize>10MB</maxFileSize>
+			<maxHistory>10</maxHistory>
+			<totalSizeCap>100MB</totalSizeCap>
+		</rollingPolicy>
+	</appender>
+
+	<root level="INFO">
+		<appender-ref ref="FILE" />
+	</root>
+</configuration>
\ No newline at end of file
diff --git a/apps/pom.xml b/apps/pom.xml
index 363953d6..4e36cc99 100644
--- a/apps/pom.xml
+++ b/apps/pom.xml
@@ -18,6 +18,7 @@
 		<module>dnet-orgs-database-application</module>
 		<module>dnet-exporter-api</module>
 		<module>scholexplorer-api</module>
+		<module>bioschemas-api</module>
diff --git a/pom.xml b/pom.xml
index 413986b3..ea4c6e21 100644
--- a/pom.xml
+++ b/pom.xml
@@ -88,6 +88,18 @@
 			<name>Cloudera Repository</name>
 			<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
 		</repository>
+		<repository>
+			<id>dnet-deps</id>
+			<name>D-Net Dependencies</name>
+			<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
+			<releases>
+				<enabled>true</enabled>
+			</releases>
+			<snapshots>
+				<enabled>false</enabled>
+			</snapshots>
+			<layout>default</layout>
+		</repository>