From 73755347646e710222db19543fe4e38820656c5f Mon Sep 17 00:00:00 2001
From: Enrico Ottonello <enrico.ottonello@isti.cnr.it>
Date: Mon, 6 Jun 2022 09:37:29 +0200
Subject: [PATCH] added app for bioschemas sources harvesting

---
 apps/bioschemas-api/deploy.info               |  10 +
 apps/bioschemas-api/pom.xml                   |  86 ++++++
 apps/bioschemas-api/pom.xml.original          | 173 ++++++++++++
 .../dnetlib/bmuse_webapp/AppConfigGarr.java   |  45 +++
 .../dnetlib/bmuse_webapp/MainApplication.java |  42 +++
 .../bmuse_webapp/ServiceScrapeDriver.java     | 261 ++++++++++++++++++
 .../bmuse_webapp/crawl/CrawlRecord.java       | 136 +++++++++
 .../bmuse_webapp/crawl/StatusOfScrape.java    |  19 ++
 .../publisher/BMUSEWebappController.java      |  65 +++++
 .../publisher/BMUSEWebappException.java       |  28 ++
 .../publisher/HomeController.java             |  17 ++
 .../bmuse_webapp/scraper/BMUSEScraper.java    |  90 ++++++
 .../bmuse_webapp/scraper/ScrapeState.java     | 157 +++++++++++
 .../bmuse_webapp/scraper/ScrapeThread.java    | 109 ++++++++
 .../bmuse_webapp/scraper/ServiceScraper.java  |  72 +++++
 .../bmuse_webapp/utils/CompressorUtil.java    |  34 +++
 .../dnetlib/bmuse_webapp/utils/UrlParser.java |  64 +++++
 .../src/main/resources/application.properties |  24 ++
 .../src/main/resources/logback-spring.xml     |  30 ++
 apps/pom.xml                                  |   1 +
 pom.xml                                       |  12 +
 21 files changed, 1475 insertions(+)
 create mode 100644 apps/bioschemas-api/deploy.info
 create mode 100644 apps/bioschemas-api/pom.xml
 create mode 100644 apps/bioschemas-api/pom.xml.original
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/MainApplication.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/CrawlRecord.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/StatusOfScrape.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappException.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/HomeController.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeState.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/CompressorUtil.java
 create mode 100644 apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/UrlParser.java
 create mode 100644 apps/bioschemas-api/src/main/resources/application.properties
 create mode 100644 apps/bioschemas-api/src/main/resources/logback-spring.xml

diff --git a/apps/bioschemas-api/deploy.info b/apps/bioschemas-api/deploy.info
new file mode 100644
index 00000000..015b818c
--- /dev/null
+++ b/apps/bioschemas-api/deploy.info
@@ -0,0 +1,10 @@
+{
+    "type_source": "SVN",
+    "goal": "package -U source:jar",
+    "url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk/",
+    "deploy_repository": "dnet5-snapshots",
+    "version": "5",
+    "mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it, enrico.ottonello@isti.cnr.it",
+    "deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots",
+    "name": "dnet-ariadneplus-graphdb-publisher"
+}
\ No newline at end of file
diff --git a/apps/bioschemas-api/pom.xml b/apps/bioschemas-api/pom.xml
new file mode 100644
index 00000000..4bcb0408
--- /dev/null
+++ b/apps/bioschemas-api/pom.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+    <parent>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <artifactId>apps</artifactId>
+        <version>3.2.8-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+
+    <modelVersion>4.0.0</modelVersion>
+    <packaging>jar</packaging>
+    <artifactId>bioschemas-api</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>hwu.elixir</groupId>
+            <artifactId>bmuse-core</artifactId>
+            <version>0.5.4</version>
+        </dependency>
+        <dependency>
+            <groupId>org.freemarker</groupId>
+            <artifactId>freemarker</artifactId>
+            <version>2.3.27-incubating</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.any23</groupId>
+            <artifactId>apache-any23-core</artifactId>
+            <version>2.3</version>
+        </dependency>
+        <dependency>
+            <groupId>org.eclipse.rdf4j</groupId>
+            <artifactId>rdf4j-rio-rdfxml</artifactId>
+            <version>3.7.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.eclipse.rdf4j</groupId>
+            <artifactId>rdf4j-model</artifactId>
+            <version>3.7.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.13.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.seleniumhq.selenium</groupId>
+            <artifactId>selenium-java</artifactId>
+            <version>3.141.59</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+            <version>2.6</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-validator</groupId>
+            <artifactId>commons-validator</artifactId>
+            <version>1.6</version>
+        </dependency>
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>1.2.3</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-help-plugin</artifactId>
+            </plugin>
+            <plugin>
+                <groupId>org.springframework.boot</groupId>
+                <artifactId>spring-boot-maven-plugin</artifactId>
+            </plugin>
+        </plugins>
+    </build>
+</project>
\ No newline at end of file
diff --git a/apps/bioschemas-api/pom.xml.original b/apps/bioschemas-api/pom.xml.original
new file mode 100644
index 00000000..f79644f5
--- /dev/null
+++ b/apps/bioschemas-api/pom.xml.original
@@ -0,0 +1,173 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <groupId>org.springframework.boot</groupId>
+        <artifactId>spring-boot-starter-parent</artifactId>
+        <version>2.1.3.RELEASE</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>eu.dnetlib</groupId>
+    <artifactId>dnet-bmuse-webapp</artifactId>
+    <packaging>jar</packaging>
+    <version>1.0.0-SNAPSHOT</version>
+    <scm>
+        <connection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk</connection>
+        <url>https://github.com/spring-projects/spring-boot/spring-boot-starter-parent/dnet-bmuse-webapp</url>
+    </scm>
+    <ciManagement>
+        <system>jenkins</system>
+        <url>https://jenkins-dnet.d4science.org/view/DNet50/job/DSL50_dnet-bmuse-webapp/</url>
+    </ciManagement>
+    <distributionManagement>
+        <repository>
+            <id>dnet5-releases</id>
+            <name>D-Net 5 Releases</name>
+            <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
+            <layout>default</layout>
+        </repository>
+    </distributionManagement>
+
+    <repositories>
+        <repository>
+            <id>dnet-deps</id>
+            <name>D-Net Dependencies</name>
+            <url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>false</enabled>
+            </snapshots>
+            <layout>default</layout>
+        </repository>
+        <repository>
+            <id>dnet5-releases</id>
+            <name>D-Net 5 Releases</name>
+            <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
+            <layout>default</layout>
+            <snapshots>
+                <enabled>false</enabled>
+            </snapshots>
+        </repository>
+        <repository>
+            <id>dnet5-snapshots</id>
+            <name>D-Net 5 Snapshots</name>
+            <url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots</url>
+            <layout>default</layout>
+            <snapshots>
+                <enabled>true</enabled>
+            </snapshots>
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.13-rc-1</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-autoconfigure</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-web</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>hwu.elixir</groupId>
+            <artifactId>bmuse-core</artifactId>
+            <version>0.5.4</version>
+        </dependency>
+        <dependency>
+            <groupId>org.freemarker</groupId>
+            <artifactId>freemarker</artifactId>
+            <version>2.3.27-incubating</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.any23</groupId>
+            <artifactId>apache-any23-core</artifactId>
+            <version>2.3</version>
+        </dependency>
+        <dependency>
+            <groupId>org.eclipse.rdf4j</groupId>
+            <artifactId>rdf4j-rio-rdfxml</artifactId>
+            <version>3.7.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.eclipse.rdf4j</groupId>
+            <artifactId>rdf4j-model</artifactId>
+            <version>3.7.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.13.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.seleniumhq.selenium</groupId>
+            <artifactId>selenium-java</artifactId>
+            <version>3.141.59</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+            <version>2.6</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-validator</groupId>
+            <artifactId>commons-validator</artifactId>
+            <version>1.6</version>
+        </dependency>
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>1.2.3</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.springframework.boot</groupId>
+                <artifactId>spring-boot-maven-plugin</artifactId>
+                <configuration>
+                    <executable>true</executable>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+    <properties>
+        <java.version>1.8</java.version>
+        <skipTests>false</skipTests>
+    </properties>
+
+    <profiles>
+        <profile>
+            <id>java8-doclint-disabled</id>
+            <activation>
+                <jdk>[1.8,)</jdk>
+            </activation>
+            <properties>
+                <javadoc.opts>-Xdoclint:none</javadoc.opts>
+            </properties>
+        </profile>
+    </profiles>
+</project>
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java
new file mode 100644
index 00000000..634f9172
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java
@@ -0,0 +1,45 @@
+package eu.dnetlib.bmuse_webapp;
+
+import org.springframework.boot.web.client.RestTemplateBuilder;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.context.annotation.Profile;
+import org.springframework.web.client.RestTemplate;
+
+import freemarker.cache.ClassTemplateLoader;
+import freemarker.template.TemplateExceptionHandler;
+
+/**
+ * @author enrico.ottonello
+ *
+ */
+@Profile("garr")
+@Configuration
+public class AppConfigGarr {
+
+    @Bean
+    public RestTemplate jrrRestTemplate() {
+        // TODO: move configuration here from CatalogueRegistrator?
+        return new RestTemplateBuilder().build();
+    }
+
+    @Bean
+    public freemarker.template.Configuration freemarkerConfig() {
+        freemarker.template.Configuration config = new freemarker.template.Configuration(freemarker.template.Configuration.VERSION_2_3_27);
+        ClassTemplateLoader ctl = new ClassTemplateLoader(getClass(), "/eu/dnetlib/bmuse_webapp/sparql");
+        config.setTemplateLoader(ctl);
+        config.setDefaultEncoding("UTF-8");
+        // Sets how errors will appear.
+        // During web page *development* TemplateExceptionHandler.HTML_DEBUG_HANDLER is better.
+        config.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
+
+        // Don't log exceptions inside FreeMarker that it will throw at you anyway:
+        config.setLogTemplateExceptions(false);
+
+        // Wrap unchecked exceptions thrown during template processing into TemplateException-s.
+        config.setWrapUncheckedExceptions(true);
+
+        return config;
+    }
+}
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/MainApplication.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/MainApplication.java
new file mode 100644
index 00000000..44bbd5ad
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/MainApplication.java
@@ -0,0 +1,42 @@
+package eu.dnetlib.bmuse_webapp;
+
+import eu.dnetlib.common.app.AbstractDnetApp;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.cache.annotation.EnableCaching;
+import org.springframework.context.annotation.ComponentScan;
+import org.springframework.scheduling.annotation.EnableScheduling;
+import springfox.documentation.builders.ApiInfoBuilder;
+import springfox.documentation.builders.RequestHandlerSelectors;
+import springfox.documentation.service.ApiInfo;
+import springfox.documentation.spring.web.plugins.Docket;
+import springfox.documentation.swagger2.annotations.EnableSwagger2;
+
+@SpringBootApplication
+@EnableSwagger2
+@EnableCaching
+@EnableScheduling
+@ComponentScan(basePackages = "eu.dnetlib")
+public class MainApplication extends AbstractDnetApp {
+
+    public static void main(final String[] args) {
+        SpringApplication.run(MainApplication.class, args);
+    }
+
+    @Override
+    protected void configSwagger(final Docket docket) {
+        docket.select()
+            .apis(RequestHandlerSelectors.any())
+            .paths(p -> p.contains("/api/"))
+            .build()
+            .apiInfo(new ApiInfoBuilder()
+                .title("D-Net Bioschemas Service APIs")
+                .description("APIs documentation")
+                .version("1.1")
+                .contact(ApiInfo.DEFAULT_CONTACT)
+                .license("Apache 2.0")
+                .licenseUrl("http://www.apache.org/licenses/LICENSE-2.0")
+                .build());
+    }
+
+}
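The two classes above are standard Spring Boot wiring. A minimal way to exercise them locally (an illustrative sketch, not part of the patch; the launcher class is hypothetical) is to boot the application with the "garr" profile that guards AppConfigGarr, which application.properties further down also sets as the default:

    import eu.dnetlib.bmuse_webapp.MainApplication;
    import org.springframework.boot.SpringApplication;

    public class LocalLauncher {
        public static void main(String[] args) {
            // the "garr" profile activates the AppConfigGarr beans
            SpringApplication.run(MainApplication.class, "--spring.profiles.active=garr");
        }
    }
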
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java
new file mode 100644
index 00000000..3b1ab451
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java
@@ -0,0 +1,261 @@
+package eu.dnetlib.bmuse_webapp;
+
+import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
+import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
+import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
+import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;
+import eu.dnetlib.bmuse_webapp.scraper.ServiceScraper;
+import eu.dnetlib.bmuse_webapp.utils.UrlParser;
+import hwu.elixir.utils.Helpers;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * Runs the scrape. Collects a list of URLs (in the form of CrawlRecords) to scrape,
+ * scrapes them in turn, writes the (bio)schema markup extracted to a file (1 file per URL)
+ * and adds provenance to the CrawlRecord.
+ *
+ */
+public class ServiceScrapeDriver {
+
+    private static final String propertiesFile = "application.properties";
+
+    private int waitTime = 1;
+    private int numberOfPagesToCrawlInALoop;
+    private int totalNumberOfPagesToCrawlInASession;
+    private String outputFolder;
+    private int pagesCounter = 0;
+    private int scrapeVersion = 1;
+
+    private String sitemapUrl;
+    private String sitemapURLKey;
+    private String maxScrapedPages;
+    private String outputFilename;
+
+    private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
+
+    private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);
+
+    public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
+        this.sitemapUrl = sitemapUrl;
+        this.sitemapURLKey = sitemapURLKey;
+        this.maxScrapedPages = maxScrapedPages;
+        this.outputFilename = outputFilename;
+    }
+
+    /**
+     * Runs the scrape process
+     *
+     */
+    public void start() throws IOException {
+        runScrape();
+    }
+
+    /**
+     * Fires off threads.
+     * Originally designed as a multi-threaded process; now reduced to a single thread as
+     * the selenium webdriver is too expensive to run multi-threaded. However, the threading
+     * has been left in situ in case it is useful in the future.
+     *
+     */
+    private void runScrape() throws IOException {
+        processProperties();
+        Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());
+        Stream<Element> urlStream;
+        if (Objects.nonNull(maxScrapedPages)) {
+            urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
+        } else {
+            urlStream = urls.stream();
+        }
+        List<Element> sites = urlStream.collect(Collectors.toList());
+        logger.info("Pages available for scraping: " + sites.size());
+
+        List<CrawlRecord> pagesToPull = generatePagesToPull(sites);
+        if (pagesToPull.isEmpty()) {
+            logger.error("Cannot retrieve URLs");
+            throw new RuntimeException("No pages found from sitemap");
+        }
+
+        ScrapeState scrapeState = new ScrapeState(pagesToPull);
+
+        logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
+        while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
+            logger.info(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
+
+            ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
+            scrape1.setName("S1");
+
+//            ScrapeThread scrape2 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
+//            scrape2.setName("S2");
+//
+//            ScrapeThread scrape3 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
+//            scrape3.setName("S3");
+//
+//            ScrapeThread scrape4 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
+//            scrape4.setName("S4");
+
+            scrape1.start();
+//            scrape2.start();
+//            scrape3.start();
+//            scrape4.start();
+            long startTime = System.nanoTime();
+
+            try {
+                scrape1.join();
+//                scrape2.join();
+//                scrape3.join();
+//                scrape4.join();
+            } catch (InterruptedException e) {
+                logger.error("Exception waiting on thread");
+                e.printStackTrace();
+                return;
+            }
+
+            if (!scrape1.isFileWritten()) {
+                logger.error("Could not write output file so shutting down!");
+                Date date = new Date(System.currentTimeMillis());
+                logger.info("ENDING CRAWL after failure at: " + formatter.format(date));
+                return;
+            }
+
+            logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
+
+            long endTime = System.nanoTime();
+            long timeElapsed = endTime - startTime;
+            logger.info("Time in s to complete: " + timeElapsed / 1e+9);
+
+            updateDatabase(scrapeState);
+            pagesCounter += numberOfPagesToCrawlInALoop;
+
+            logger.info("ENDED loop");
+        }
+
+//        Map<String, String> nquads = scrapeState.getNquadsConcurrentHashMap();
+//        logger.info("Available nquads records: " + nquads.size());
+
+        logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
+
+        File output = new File(outputFolder.concat("/").concat(outputFilename));
+        if (output.exists()) {
+            output.delete();
+            output.createNewFile();
+        }
+        FileWriter fileWriter;
+        BufferedWriter bufferedWriter;
+        fileWriter = new FileWriter(output.getAbsoluteFile(), true); // true to append
+        bufferedWriter = new BufferedWriter(fileWriter);
+
+        List<CrawlRecord> processed = scrapeState.getPagesProcessed();
+        for (int i = 0; i < processed.size(); i++) {
+            // one base64-encoded, gzipped nquads record per line
+            bufferedWriter.write(processed.get(i).getNquads());
+            bufferedWriter.newLine();
+        }
+        bufferedWriter.close();
+    }
+
+    private void updateDatabase(ScrapeState scrapeState) {
+        // provenance is kept on the CrawlRecords held in memory by the ScrapeState;
+        // there is no backing DBMS to update in this webapp version of the driver
+    }
+
+    /**
+     * Generates the list of CrawlRecords to pull from the URLs found in the sitemap.
+     *
+     */
+    private List<CrawlRecord> generatePagesToPull(List<Element> sites) {
+        List<CrawlRecord> crawls = sites
+            .stream()
+            .map(s -> {
+                CrawlRecord crawlRecord = new CrawlRecord(s.text());
+                String[] urlSplitted = crawlRecord.getUrl().split("/");
+                String name = urlSplitted[urlSplitted.length - 1];
+                crawlRecord.setName(name);
+                return crawlRecord;
+            })
+            .collect(Collectors.toList());
+        return crawls;
+    }
+
+    /**
+     * Updates properties based on properties file in src > main > resources
+     *
+     */
+    private void processProperties() {
+        ClassLoader classLoader = ServiceScrapeDriver.class.getClassLoader();
+
+        InputStream is = classLoader.getResourceAsStream(propertiesFile);
+        if (is == null) {
+            logger.error("Cannot find " + propertiesFile + " file");
+            throw new IllegalArgumentException(propertiesFile + " file is not found!");
+        }
+
+        Properties prop = new Properties();
+
+        try {
+            prop.load(is);
+        } catch (IOException e) {
+            logger.error("Cannot load application.properties", e);
+            System.exit(0);
+        }
+
+        waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
+        logger.info("waitTime: " + waitTime);
+        outputFolder = prop.getProperty("outputFolder").trim();
+        logger.info("outputFolder: " + outputFolder);
+        numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
+        logger.info("numberOfPagesToCrawl: " + numberOfPagesToCrawlInALoop);
+        totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
+        logger.info("totalNumberOfPagesToCrawlInASession: " + totalNumberOfPagesToCrawlInASession);
+        scrapeVersion = Integer.parseInt(prop.getProperty("scrapeVersion").trim());
+        logger.info("scrapeVersion: " + scrapeVersion);
+        logger.info("\n\n\n");
+    }
+
+    public String getSitemapUrl() {
+        return sitemapUrl;
+    }
+
+    public String getSitemapURLKey() {
+        return sitemapURLKey;
+    }
+
+    private String getId(String pageUrl) {
+        String[] parts = pageUrl.split("/");
+        return parts[parts.length - 1];
+    }
+
+}
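A usage sketch of the driver defined above (illustrative, not part of the patch; the sitemap URL is one of the examples given later in BMUSEWebappController, and a null maxScrapedPages means no page limit):

    import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;

    public class ScrapeOnce {
        public static void main(String[] args) throws java.io.IOException {
            ServiceScrapeDriver driver = new ServiceScrapeDriver(
                    "https://disprot.org/sitemap2.xml.gz", // sitemapUrl
                    "loc",                                 // sitemapURLKey: the tag holding each URL in the sitemap
                    null,                                  // maxScrapedPages: null means no limit
                    "disprot_base64_gzipped_nquads.txt");  // outputFilename, written under outputFolder
            driver.start(); // runs one scrape session synchronously
        }
    }
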
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/CrawlRecord.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/CrawlRecord.java
new file mode 100644
index 00000000..e711d745
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/CrawlRecord.java
@@ -0,0 +1,136 @@
+package eu.dnetlib.bmuse_webapp.crawl;
+
+import java.util.Date;
+
+import hwu.elixir.utils.Validation;
+
+/**
+ * Stores the current status of a single URL in the scrape service.
+ *
+ */
+public class CrawlRecord {
+
+    private Long id;
+
+    private String context = "";
+
+    private String url;
+
+    private Date dateScraped;
+
+    private StatusOfScrape status;
+
+    private boolean beingScraped;
+
+    private String name;
+
+    private String nquads;
+
+    public CrawlRecord() {
+        status = StatusOfScrape.UNTRIED;
+    }
+
+    public CrawlRecord(String url) {
+        Validation validation = new Validation();
+        if (validation.validateURI(url)) {
+            this.url = url;
+            context = "";
+            status = StatusOfScrape.UNTRIED;
+            dateScraped = null;
+        } else {
+            throw new IllegalArgumentException(url + " is not a valid url");
+        }
+        this.setId(System.currentTimeMillis());
+    }
+
+    public Long getId() {
+        return id;
+    }
+
+    public void setId(Long id) {
+        this.id = id;
+    }
+
+    public String getUrl() {
+        return url;
+    }
+
+    public Date getDateScraped() {
+        return dateScraped;
+    }
+
+    public void setDateScraped(Date dateScraped) {
+        this.dateScraped = dateScraped;
+    }
+
+    public StatusOfScrape getStatus() {
+        return status;
+    }
+
+    public void setStatus(StatusOfScrape status) {
+        this.status = status;
+    }
+
+    public String getContext() {
+        return context;
+    }
+
+    public void setContext(String context) {
+        this.context = context;
+    }
+
+    public boolean isBeingScraped() {
+        return beingScraped;
+    }
+
+    public void setBeingScraped(boolean beingScraped) {
+        this.beingScraped = beingScraped;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getNquads() {
+        return nquads;
+    }
+
+    public void setNquads(String nquads) {
+        this.nquads = nquads;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o)
+            return true;
+        if (!(o instanceof CrawlRecord))
+            return false;
+
+        CrawlRecord otherCrawl = (CrawlRecord) o;
+
+        return this.url.equals(otherCrawl.getUrl());
+    }
+
+    @Override
+    public int hashCode() {
+        int result = getId() != null ? getId().hashCode() : 0;
+        result = 31 * result + (getUrl() != null ? getUrl().hashCode() : 0);
+        result = 31 * result + (getContext() != null ? getContext().hashCode() : 0);
+        result = 31 * result + (getDateScraped() != null ? getDateScraped().hashCode() : 0);
+        return result;
+    }
+
+}
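A short sketch of the CrawlRecord life cycle (illustrative; the example URL and demo class are hypothetical), using the status values defined next in StatusOfScrape:

    import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
    import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;

    public class CrawlRecordDemo {
        public static void main(String[] args) {
            CrawlRecord record = new CrawlRecord("https://disprot.org/DP00086"); // validates the URL; status starts as UNTRIED
            record.setStatus(StatusOfScrape.FAILED);   // first failed attempt: eligible for one retry
            record.setStatus(StatusOfScrape.GIVEN_UP); // second failure: no further attempts
            System.out.println(record.getUrl() + " -> " + record.getStatus());
        }
    }
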
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/StatusOfScrape.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/StatusOfScrape.java
new file mode 100644
index 00000000..9ecd7ba1
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/crawl/StatusOfScrape.java
@@ -0,0 +1,19 @@
+package eu.dnetlib.bmuse_webapp.crawl;
+
+/**
+ * {@link eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape} describes the possible status levels of the scrape for each URL/CrawlRecord.
+ *
+ * Each URL/CrawlRecord can have one of the following:
+ * DOES_NOT_EXIST = 404.
+ * HUMAN_INSPECTION = cannot parse for some reason; a human should see what is happening.
+ * UNTRIED = not scraped yet.
+ * FAILED = one failed attempt at scraping; will try again.
+ * GIVEN_UP = two failed attempts at scraping; will not try again.
+ * SUCCESS = successfully scraped.
+ *
+ */
+public enum StatusOfScrape {
+    DOES_NOT_EXIST, HUMAN_INSPECTION, UNTRIED, FAILED, GIVEN_UP, SUCCESS;
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java
new file mode 100644
index 00000000..22beeb97
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java
@@ -0,0 +1,65 @@
+package eu.dnetlib.bmuse_webapp.publisher;
+
+import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
+import eu.dnetlib.common.controller.AbstractDnetController;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.LineIterator;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.web.bind.annotation.*;
+
+import javax.servlet.http.HttpServletResponse;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+/**
+ * @author enrico.ottonello
+ *
+ */
+@RestController
+@RequestMapping("/api")
+public class BMUSEWebappController extends AbstractDnetController {
+
+    private static final Log log = LogFactory.getLog(BMUSEWebappController.class);
+
+    @RequestMapping(value = "/version", method = RequestMethod.GET)
+    public String version() throws BMUSEWebappException {
+        return "1.0.0-SNAPSHOT";
+    }
+
+    @RequestMapping(value = "/scrape", method = RequestMethod.GET)
+    public String scrape(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
+
+        log.info("datasourceKey: " + datasourceKey + " sitemapUrl: " + sitemapUrl);
+//        String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz";          scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
+//        String sitemapUrl = "https://proteinensemble.org/sitemap2.xml.gz"; scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
+//        String sitemapUrl = "https://disprot.org/sitemap2.xml.gz";         scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
+        String sitemapUrlKey = "loc";
+        String outputFilename = datasourceKey.concat("_base64_gzipped_nquads.txt");
+        ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
+        service.start();
+        return "started";
+    }
+
+    @RequestMapping(value = "/nquads", method = RequestMethod.GET)
+    public String nquads(HttpServletResponse response) throws BMUSEWebappException, IOException {
+        LineIterator it = FileUtils.lineIterator(new File("/Users/enrico.ottonello/data/bmuse-output/output.nq"), "UTF-8");
+        try {
+            while (it.hasNext()) {
+                String line = it.nextLine();
+                response.getOutputStream().write(line.getBytes(StandardCharsets.UTF_8));
+                response.getOutputStream().println();
+            }
+        } finally {
+            LineIterator.closeQuietly(it);
+        }
+        return "";
+    }
+}
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappException.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappException.java
new file mode 100644
index 00000000..9687ebfa
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappException.java
@@ -0,0 +1,28 @@
+package eu.dnetlib.bmuse_webapp.publisher;
+
+/**
+ * @author enrico.ottonello
+ *
+ */
+public class BMUSEWebappException extends Exception {
+
+    public BMUSEWebappException() {
+    }
+
+    public BMUSEWebappException(final String message) {
+        super(message);
+    }
+
+    public BMUSEWebappException(final String message, final Throwable cause) {
+        super(message, cause);
+    }
+
+    public BMUSEWebappException(final Throwable cause) {
+        super(cause);
+    }
+
+    public BMUSEWebappException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) {
+        super(message, cause, enableSuppression, writableStackTrace);
+    }
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/HomeController.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/HomeController.java
new file mode 100644
index 00000000..ab7143f2
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/HomeController.java
@@ -0,0 +1,17 @@
+package eu.dnetlib.bmuse_webapp.publisher;
+
+import eu.dnetlib.common.controller.AbstractDnetController;
+import org.springframework.stereotype.Controller;
+import org.springframework.web.bind.annotation.GetMapping;
+
+@Controller
+public class HomeController extends AbstractDnetController {
+
+    @GetMapping({
+        "/doc", "/swagger"
+    })
+    public String apiDoc() {
+        return "redirect:swagger-ui/";
+    }
+
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java
new file mode 100644
index 00000000..4203f18a
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java
@@ -0,0 +1,90 @@
+package eu.dnetlib.bmuse_webapp.scraper;
+
+import hwu.elixir.scrape.exceptions.MissingMarkupException;
+import hwu.elixir.scrape.scraper.ScraperFilteredCore;
+import org.apache.any23.Any23;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.source.DocumentSource;
+import org.apache.any23.source.StringDocumentSource;
+import org.apache.any23.writer.NTriplesWriter;
+import org.apache.any23.writer.TripleHandler;
+import org.apache.any23.writer.TripleHandlerException;
+import org.apache.commons.io.output.ByteArrayOutputStream;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Model;
+import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
+import org.eclipse.rdf4j.rio.RDFFormat;
+import org.eclipse.rdf4j.rio.Rio;
+
+import java.io.IOException;
+import java.io.StringWriter;
+
+public class BMUSEScraper extends ScraperFilteredCore {
+
+    private static final Log logger = LogFactory.getLog(BMUSEScraper.class);
+
+    public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
+        logger.debug(url + " > scraping");
+        url = fixURL(url);
+
+        String html = "";
+        // The dynamic boolean determines if the scraper should use selenium or JSOUP
+        // to scrape the information (dynamic and static respectively)
+        if (dynamic) {
+            html = wrapHTMLExtraction(url);
+        } else {
+            html = wrapHTMLExtractionStatic(url);
+        }
+
+        if (html == null || html.contentEquals(""))
+            throw new Exception("empty html");
+
+        html = injectId(html, url);
+
+        logger.debug(url + " > html scraped");
+        DocumentSource source = new StringDocumentSource(html, url);
+        String n3 = html2Triples(source, url);
+        if (n3 == null) {
+            throw new MissingMarkupException(url);
+        }
+
+        logger.debug(url + " > processing triples");
+        IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
+        Model updatedModel = processTriples(n3, sourceIRI, 0L);
+        if (updatedModel == null) {
+            throw new Exception("rdf model null");
+        }
+
+        logger.debug(url + " > generating nquads");
+        try (StringWriter nquadsWriter = new StringWriter()) {
+            Rio.write(updatedModel, nquadsWriter, RDFFormat.NQUADS);
+            logger.debug(url + " > nquads generated");
+            return nquadsWriter.toString();
+        }
+    }
+
+    private String html2Triples(DocumentSource source, String url) throws Exception {
+        Any23 runner = new Any23();
+        try (ByteArrayOutputStream out = new ByteArrayOutputStream();
+                TripleHandler handler = new NTriplesWriter(out);) {
+            runner.extract(source, handler);
+            return out.toString("UTF-8");
+        } catch (ExtractionException e) {
+            logger.error("Cannot extract triples", e);
+        } catch (IOException e1) {
+            logger.error("IO error whilst extracting triples", e1);
+        } catch (TripleHandlerException e2) {
+            logger.error("TripleHandlerException", e2);
+        }
+        return null;
+    }
+}
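A one-off scrape with the class above might look as follows (illustrative sketch; the page URL and demo class are hypothetical, `true` selects dynamic rendering through Selenium so the chromedriver configured in application.properties must be available, and shutdown() is assumed to come from the bmuse-core scraper base class since ScrapeThread calls it the same way below):

    import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;

    public class SinglePageScrape {
        public static void main(String[] args) throws Exception {
            BMUSEScraper scraper = new BMUSEScraper();
            try {
                // true = dynamic rendering through Selenium; false = static fetch through JSoup
                String nquads = scraper.getNQUADSFromUrl("https://disprot.org/DP00086", true);
                System.out.println(nquads);
            } finally {
                scraper.shutdown(); // releases the webdriver
            }
        }
    }
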
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeState.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeState.java
new file mode 100644
index 00000000..11aababe
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeState.java
@@ -0,0 +1,157 @@
+package eu.dnetlib.bmuse_webapp.scraper;
+
+import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
+import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Thread-safe store of the pages waiting to be scraped and the pages already
+ * processed during a scrape cycle.
+ */
+public class ScrapeState {
+
+    private List<CrawlRecord> urlsToScrape = Collections.synchronizedList(new ArrayList<CrawlRecord>());
+    private List<CrawlRecord> urlsProcessed = Collections.synchronizedList(new ArrayList<CrawlRecord>()); // should this be a set?
+    private Map<String, String> nquadsConcurrentHashMap = new ConcurrentHashMap<>();
+
+    /**
+     *
+     * @param pagesToBeScraped The list of sites to be scraped
+     * @see ScrapeThread
+     * @see CrawlRecord
+     */
+    public ScrapeState(List<CrawlRecord> pagesToBeScraped) {
+        urlsToScrape.addAll(pagesToBeScraped);
+    }
+
+    /**
+     * Any pages/URLs left to scrape?
+     * @return True for yes & false for no
+     * @see CrawlRecord
+     */
+    public synchronized boolean pagesLeftToScrape() {
+        return !urlsToScrape.isEmpty();
+    }
+
+    /**
+     * Returns the next URL/CrawlRecord to be scraped
+     *
+     * @return First page/URL that needs to be scraped next
+     * @see CrawlRecord
+     */
+    public synchronized CrawlRecord getURLToProcess() {
+        if (urlsToScrape.isEmpty())
+            return null;
+
+        return urlsToScrape.remove(0);
+    }
+
+    /**
+     * Adds the given CrawlRecord to the list of CrawlRecords successfully scraped.
+     * Updates the status of the CrawlRecord to SUCCESS.
+     *
+     * @param record The latest URL/page that has been successfully scraped
+     * @see CrawlRecord
+     */
+    public synchronized void addSuccessfulScrapedURL(CrawlRecord record) {
+        record.setStatus(StatusOfScrape.SUCCESS);
+        urlsProcessed.add(record);
+    }
+
+    /**
+     * Adds the given CrawlRecord to the list of CrawlRecords NOT successfully scraped.
+     * Updates the status of the CrawlRecord: on the first failure the status is FAILED;
+     * if the status is already FAILED it is changed to GIVEN_UP.
+     *
+     * If the status is FAILED, another try will be made in a future run.
+     *
+     * @param record The latest URL/page that has been unsuccessfully scraped
+     * @see CrawlRecord
+     */
+    public synchronized void addFailedToScrapeURL(CrawlRecord record) {
+        if (record.getStatus().equals(StatusOfScrape.FAILED)) {
+            record.setStatus(StatusOfScrape.GIVEN_UP);
+        } else {
+            record.setStatus(StatusOfScrape.FAILED);
+        }
+        urlsProcessed.add(record);
+    }
+
+    /**
+     * Changes the status of the CrawlRecord to DOES_NOT_EXIST.
+     * As Selenium does not return the HTTP codes, it is questionable
+     * how useful this is.
+     *
+     * @param record The latest URL/page that has been 404'd
+     * @see CrawlRecord
+     */
+    public synchronized void setStatusTo404(CrawlRecord record) {
+        record.setStatus(StatusOfScrape.DOES_NOT_EXIST);
+        urlsProcessed.add(record);
+    }
+
+    /**
+     * Changes the status of the CrawlRecord to HUMAN_INSPECTION.
+     * This captures the idea that the URLs may contain unexpected markup that needs a human to
+     * review and possibly update the scraper.
+     *
+     * @param record The latest URL/page that needs human inspection
+     * @see CrawlRecord
+     */
+    public synchronized void setStatusToHumanInspection(CrawlRecord record) {
+        record.setStatus(StatusOfScrape.HUMAN_INSPECTION);
+        urlsProcessed.add(record);
+    }
+
+    /**
+     * Returns the number of URLs that are still to be scraped in this cycle.
+     * This does not return the number of URLs left to scrape in the DBMS, just in the current cycle.
+     *
+     * @return Number of URLs left to scrape in this cycle
+     * @see CrawlRecord
+     */
+    public synchronized int getNumberPagesLeftToScrape() {
+        return urlsToScrape.size();
+    }
+
+    /**
+     * Gets the full list of URLs that have been processed in this cycle.
+     * This does not return the number of URLs that have been scraped in total across all cycles.
+     *
+     * @return List of CrawlRecords processed in this cycle
+     * @see CrawlRecord
+     */
+    public synchronized List<CrawlRecord> getPagesProcessed() {
+        return urlsProcessed;
+    }
+
+    /**
+     * Gets the full list of URLs/CrawlRecords regardless of whether scraped or not in the current cycle.
+     *
+     * @return List of all CrawlRecords in this cycle.
+     * @see CrawlRecord
+     */
+    public synchronized List<CrawlRecord> getPagesProcessedAndUnprocessed() {
+        List<CrawlRecord> urlsCombined = Collections.synchronizedList(new ArrayList<CrawlRecord>());
+        urlsCombined.addAll(urlsProcessed);
+        urlsCombined.addAll(urlsToScrape);
+        return urlsCombined;
+    }
+
+    public void addNquads(String key, String nquads) {
+        nquadsConcurrentHashMap.putIfAbsent(key, nquads);
+    }
+
+    public Map<String, String> getNquadsConcurrentHashMap() {
+        return nquadsConcurrentHashMap;
+    }
+}
\ No newline at end of file
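A concurrency sketch for ScrapeState (illustrative; the demo class is hypothetical): getURLToProcess() removes the head of the queue inside a synchronized method, so each CrawlRecord is handed to exactly one worker even when several threads share the same state object:

    import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
    import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;

    import java.util.Arrays;
    import java.util.List;

    public class ScrapeStateDemo {
        public static void main(String[] args) {
            List<CrawlRecord> pagesToPull = Arrays.asList(new CrawlRecord("https://disprot.org/DP00086"));
            ScrapeState state = new ScrapeState(pagesToPull);
            CrawlRecord next;
            while ((next = state.getURLToProcess()) != null) { // synchronized removal: each record goes to one worker only
                state.addSuccessfulScrapedURL(next); // or addFailedToScrapeURL / setStatusTo404 / setStatusToHumanInspection
            }
            System.out.println(state.getPagesProcessed().size() + " processed");
        }
    }
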
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java
new file mode 100644
index 00000000..7fe1ef88
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java
@@ -0,0 +1,109 @@
+package eu.dnetlib.bmuse_webapp.scraper;
+
+import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
+import eu.dnetlib.bmuse_webapp.utils.CompressorUtil;
+import hwu.elixir.scrape.exceptions.CannotWriteException;
+import hwu.elixir.scrape.exceptions.FourZeroFourException;
+import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
+import hwu.elixir.scrape.exceptions.MissingMarkupException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.Date;
+
+/**
+ * @see BMUSEScraper
+ * @see ScrapeState
+ *
+ */
+public class ScrapeThread extends Thread {
+    private ScrapeState scrapeState;
+    private BMUSEScraper process;
+    private int waitTime;
+    private boolean fileWritten = true;
+    private int scrapeVersion = 1;
+
+    private static final Log logger = LogFactory.getLog(ScrapeThread.class);
+
+    /**
+     * Sets up a thread for actually scraping.
+     *
+     * @param scraper        Scraper that will actually do the scraping.
+     * @param scrapeState    Object that maintains state across threads.
+     * @param waitTime       How long (in seconds) a thread should wait after scraping a
+     *                       page before attempting a new page.
+     * @param contextVersion The context URL used is 'https://bioschemas.org/crawl/CONTEXTVERSION/ID' where ID is the id of the CrawlRecord pulled.
+     */
+    public ScrapeThread(BMUSEScraper scraper, ScrapeState scrapeState, int waitTime, int contextVersion) {
+        this.scrapeState = scrapeState;
+        process = scraper;
+        this.waitTime = waitTime;
+        this.scrapeVersion = contextVersion;
+    }
+
+    /**
+     * Defines the high-level process of scraping. The actual scraping is done by an
+     * implementation of Scraper. If the page scrape is successful the url is added to
+     * Scrape.sitesScraped
+     *
+     * @see Scraper
+     * @see SimpleScraper
+     */
+    @Override
+    public void run() {
+        while (scrapeState.pagesLeftToScrape()) {
+            CrawlRecord record = scrapeState.getURLToProcess();
+
+            if (record == null)
+                break;
+
+            record.setContext("https://bioschemas.org/crawl/" + scrapeVersion + "/" + record.getId());
+            record.setDateScraped(new Date());
+
+            try {
+                String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
+//                scrapeState.addNquads(record.getName(), nquads);
+                logger.info("downloaded " + record.getUrl());
+                record.setNquads(CompressorUtil.compressValue(nquads));
+                if (!nquads.isEmpty()) {
+                    scrapeState.addSuccessfulScrapedURL(record);
+                } else {
+                    scrapeState.addFailedToScrapeURL(record);
+                }
+            } catch (FourZeroFourException fourZeroFourException) {
+                scrapeState.setStatusTo404(record);
+                fileWritten = false;
+            } catch (JsonLDInspectionException je) {
+                scrapeState.setStatusToHumanInspection(record);
+                fileWritten = false;
+            } catch (CannotWriteException cannotWrite) {
+                logger.error("Caught cannot write file, setting worked to false!");
+                fileWritten = false;
+                scrapeState.addFailedToScrapeURL(record);
+                return; // no point in continuing
+            } catch (MissingMarkupException e) {
+                logger.error("Cannot obtain markup from " + record.getUrl() + ".");
+                fileWritten = false;
+                scrapeState.addFailedToScrapeURL(record);
+            } catch (Exception e) {
+                e.printStackTrace();
+            }
+            try {
+                ScrapeThread.sleep(100 * waitTime);
+            } catch (InterruptedException e) {
+                e.printStackTrace();
+            }
+        }
+        process.shutdown();
+    }
+
+    public boolean isFileWritten() {
+        return fileWritten;
+    }
+}
\ No newline at end of file
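This is how ServiceScrapeDriver.runScrape() above wires the pieces together; a condensed sketch (illustrative; the demo class is hypothetical, and the constants mirror the application.properties defaults below):

    import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
    import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
    import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
    import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;

    import java.util.Collections;

    public class OneWorkerRun {
        public static void main(String[] args) throws InterruptedException {
            ScrapeState state = new ScrapeState(Collections.singletonList(new CrawlRecord("https://disprot.org/DP00086")));
            ScrapeThread worker = new ScrapeThread(new BMUSEScraper(), state, 5 /* waitTime, seconds */, 1 /* scrapeVersion */);
            worker.setName("S1");
            worker.start();
            worker.join(); // a single worker by design: the Selenium webdriver is too expensive to run multi-threaded
            System.out.println("file written: " + worker.isFileWritten());
        }
    }
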
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java
new file mode 100644
index 00000000..b13e9cb6
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java
@@ -0,0 +1,72 @@
+package eu.dnetlib.bmuse_webapp.scraper;
+
+import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
+import hwu.elixir.scrape.exceptions.*;
+import hwu.elixir.scrape.scraper.ScraperFilteredCore;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Provides the actual scraping functionality.
+ *
+ * Scrapes a given URL, converts into NQuads and writes to a file (name derived
+ * from URL). If the file already exists it will be overwritten.
+ *
+ * @see ScraperFilteredCore
+ *
+ */
+public class ServiceScraper extends ScraperFilteredCore {
+
+    private static Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
+
+    private StatusOfScrape status = null;
+
+    /**
+     * Orchestrates the process of scraping a site before converting the extracted
+     * triples to NQuads and writing to a file.
+     *
+     * @param url              Site to be scraped
+     * @param contextCounter   Number used to generate the named graph/context and
+     *                         the URLs used to replace blank nodes.
+     * @param outputFolderName Location to which the NQuads will be written
+     * @return True if success; false otherwise
+     * @throws FourZeroFourException
+     * @throws JsonLDInspectionException
+     * @throws CannotWriteException
+     * @throws MissingMarkupException
+     *
+     */
+    public boolean scrape(String url, Long contextCounter, String outputFolderName, String fileName, StatusOfScrape status) throws FourZeroFourException, JsonLDInspectionException, CannotWriteException, MissingMarkupException {
+        this.status = status;
+        logger.info("scraping " + url + " to " + fileName);
+        return scrape(url, outputFolderName, fileName, contextCounter, true);
+    }
+
+    /* Now takes account of StateOfCrawl
+     */
+    @Override
+    protected String wrapHTMLExtraction(String url) throws FourZeroFourException {
+        String html = "";
+        if (status.equals(StatusOfScrape.UNTRIED) || status.equals(StatusOfScrape.FAILED)) {
+            try {
+                html = getHtmlViaSelenium(url);
+            } catch (SeleniumException e) {
+                // try again
+                try {
+                    html = getHtmlViaSelenium(url);
+                } catch (SeleniumException e2) {
+                    return "";
+                }
+            }
+        } else {
+            return "";
+        }
+        return html;
+    }
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/CompressorUtil.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/CompressorUtil.java
new file mode 100644
index 00000000..cab5102a
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/CompressorUtil.java
@@ -0,0 +1,34 @@
+package eu.dnetlib.bmuse_webapp.utils;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.io.IOUtils;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+public class CompressorUtil {
+
+    public static String decompressValue(final String abstractCompressed) {
+        try {
+            byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
+            GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
+            final StringWriter stringWriter = new StringWriter();
+            IOUtils.copy(gis, stringWriter);
+            return stringWriter.toString();
+        } catch (IOException e) {
+            throw new IllegalArgumentException(e);
+        }
+    }
+
+    public static String compressValue(final String value) throws IOException {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        GZIPOutputStream gzip = new GZIPOutputStream(out);
+        gzip.write(value.getBytes());
+        gzip.close();
+        return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
+    }
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/UrlParser.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/UrlParser.java
new file mode 100644
index 00000000..39c64791
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/utils/UrlParser.java
@@ -0,0 +1,64 @@
+package eu.dnetlib.bmuse_webapp.utils;
+
+import hwu.elixir.utils.Helpers;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+public class UrlParser {
+
+    private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());
+
+    public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
+
+        Document doc = new Document(url);
+        Elements elements = new Elements();
+        Elements sitemaps = new Elements();
+        boolean sitemapindex = false;
+        boolean urlset = false;
+
+        try {
+            int urlLength = url.length();
+            logger.info("parse sitemap list");
+            String sitemapExt = url.substring(urlLength - 3, urlLength);
+            if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
+                logger.info("compressed sitemap");
+                byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
+                doc = Helpers.gzipFileDecompression(bytes);
+            } else {
+                doc = Jsoup.connect(url).maxBodySize(0).get();
+            }
+        } catch (IOException e) {
+            logger.error("Jsoup parsing exception: " + e.getMessage());
+        }
+
+        try {
+            elements = doc.select(sitemapURLKey);
+
+            // check the html if it is a sitemapindex or a urlset
+            sitemapindex = doc.outerHtml().contains("sitemapindex");
+            urlset = doc.outerHtml().contains("urlset");
+        } catch (NullPointerException e) {
+            logger.error(e.getMessage());
+        }
+
+        if (sitemapindex) {
+            // if sitemapindex get the loc of all the sitemaps
+            // added warning for sitemap index files
+            logger.warn("please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
+            sitemaps = doc.select(sitemapURLKey);
+        }
+
+        return elements;
+    }
+}
diff --git a/apps/bioschemas-api/src/main/resources/application.properties b/apps/bioschemas-api/src/main/resources/application.properties
new file mode 100644
index 00000000..ce5f349b
--- /dev/null
+++ b/apps/bioschemas-api/src/main/resources/application.properties
@@ -0,0 +1,24 @@
+server.servlet.context-path=/dnet-bmuse-webapp
+server.port=8281
+
+spring.profiles.active=garr
+
+logging.file.name = /var/log/springboot/9480/oa_organizations.log
+
+maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
+
+spring.main.banner-mode = off
+
+logging.level.root = INFO
+
+management.endpoints.web.exposure.include = prometheus,health
+management.endpoints.web.base-path = /
+management.endpoints.web.path-mapping.prometheus = metrics
+management.endpoints.web.path-mapping.health = health
+
+waitTime=5
+outputFolder=/Users/enrico.ottonello/data/bmuse-output
+numberOfPagesToCrawlInALoop=8
+totalNumberOfPagesToCrawlInASession=32
+chromiumDriverLocation = /usr/local/bin/chromedriver
+scrapeVersion=1
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/resources/logback-spring.xml b/apps/bioschemas-api/src/main/resources/logback-spring.xml
new file mode 100644
index 00000000..3c5e86fe
--- /dev/null
+++ b/apps/bioschemas-api/src/main/resources/logback-spring.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+
+    <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
+        <file>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log</file>
+        <encoder>
+            <pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M - %msg%n</pattern>
+        </encoder>
+        <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
+            <fileNamePattern>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp_%d{dd-MM-yyyy}_%i.log</fileNamePattern>
+            <maxFileSize>10MB</maxFileSize>
+            <maxHistory>10</maxHistory>
+            <totalSizeCap>100MB</totalSizeCap>
+        </rollingPolicy>
+    </appender>
+
+    <root level="INFO">
+        <appender-ref ref="FILE" />
+    </root>
+
+</configuration>
\ No newline at end of file
diff --git a/apps/pom.xml b/apps/pom.xml
index 363953d6..4e36cc99 100644
--- a/apps/pom.xml
+++ b/apps/pom.xml
@@ -18,6 +18,7 @@
 		<module>dnet-orgs-database-application</module>
 		<module>dnet-exporter-api</module>
 		<module>scholexplorer-api</module>
+		<module>bioschemas-api</module>
 	</modules>
 
 	<dependencies>
diff --git a/pom.xml b/pom.xml
index 413986b3..ea4c6e21 100644
--- a/pom.xml
+++ b/pom.xml
@@ -88,6 +88,18 @@
 			<name>Cloudera Repository</name>
 			<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
 		</repository>
+		<repository>
+			<id>dnet-deps</id>
+			<name>D-Net Dependencies</name>
+			<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
+			<releases>
+				<enabled>true</enabled>
+			</releases>
+			<snapshots>
+				<enabled>false</enabled>
+			</snapshots>
+			<layout>default</layout>
+		</repository>
 	</repositories>
 
 	<dependencies>
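Finally, a round-trip sketch for CompressorUtil (illustrative; the nquads line and demo class are hypothetical), matching how ScrapeThread stores markup on a CrawlRecord and how a consumer of the *_base64_gzipped_nquads.txt output can read it back:

    import eu.dnetlib.bmuse_webapp.utils.CompressorUtil;

    import java.io.IOException;

    public class CompressorRoundTrip {
        public static void main(String[] args) throws IOException {
            // hypothetical nquads record, shaped like the ones ScrapeThread stores on a CrawlRecord
            String nquads = "<https://disprot.org/DP00086> <http://schema.org/name> \"DP00086\" <https://bioschemas.org/crawl/1/0> .";
            String packed = CompressorUtil.compressValue(nquads);     // gzip + Base64: one single-line record per page
            String unpacked = CompressorUtil.decompressValue(packed); // the original nquads again
            System.out.println(unpacked.equals(nquads));              // true
        }
    }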