diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/ServiceScrapeDriver.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/ServiceScrapeDriver.java index bbe7adc8..aec1e185 100644 --- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/ServiceScrapeDriver.java +++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/ServiceScrapeDriver.java @@ -44,19 +44,12 @@ public class ServiceScrapeDriver { private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class); - public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) { + public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename, String outputFolder) { this.sitemapUrl = sitemapUrl; this.sitemapURLKey = sitemapURLKey; this.maxScrapedPages = maxScrapedPages; this.outputFilename = outputFilename; - } - - /** - * Runs the scrape process - * - */ - public void start() throws IOException { - runScrape(); + this.outputFolder = outputFolder; } /** @@ -66,7 +59,7 @@ public class ServiceScrapeDriver { * as been left in situ in case it is useful in the future. * */ - private void runScrape() throws IOException { + public void runScrape() throws IOException { processProperties(); String url = sitemapUrl.toLowerCase(); Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey()); @@ -189,8 +182,6 @@ public class ServiceScrapeDriver { waitTime = Integer.parseInt(prop.getProperty("waitTime").trim()); logger.info(" waitTime: " + waitTime); - outputFolder = prop.getProperty("outputFolder").trim(); - logger.info(" outputFolder: " + outputFolder); numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim()); logger.info(" numberOfPagesToCrawl: " + numberOfPagesToCrawlInALoop); totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim()); diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/controller/BioschemasAPIController.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/controller/BioschemasAPIController.java index b810e0ef..e4773f9f 100644 --- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/controller/BioschemasAPIController.java +++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/controller/BioschemasAPIController.java @@ -44,7 +44,7 @@ public class BioschemasAPIController extends AbstractDnetController { @GetMapping("/startScraping") public ScrapingExecution startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl, final HttpServletRequest req) { logger.info(" datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl); - return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr()); + return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr(), getOutputFolder()); } @GetMapping("/startScraping/status") @@ -76,4 +76,12 @@ public class BioschemasAPIController extends AbstractDnetController { public String getOutputDataPattern() { return outputDataPattern; } + + public void setOutputFolder(String outputFolder) { + this.outputFolder = outputFolder; + } + + public void setOutputDataPattern(String outputDataPattern) { + this.outputDataPattern = outputDataPattern; + } } \ No newline at end of file diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecutor.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecutor.java index e3ccb6ff..341f0d20 100644 --- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecutor.java +++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecutor.java @@ -14,7 +14,7 @@ public class ScrapingExecutor { return lastScrapingExecution; } - public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr) { + public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr, final String outputFolder) { synchronized (lastScrapingExecution) { if (lastScrapingExecution.getStatus() != ScrapingStatus.RUNNING) { lastScrapingExecution.startNew("Scraping for " + datasourceKey + " " + sitemapUrl + " - request from " + remoteAddr); @@ -22,8 +22,8 @@ public class ScrapingExecutor { try { String sitemapUrlKey = "loc"; String outputFilename = datasourceKey.concat(outputDataPattern); - ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename); - service.start(); + ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename, outputFolder); + service.runScrape(); lastScrapingExecution.complete(); } catch (final Throwable e) { lastScrapingExecution.fail(e); diff --git a/apps/bioschemas-api/src/main/resources/application.properties b/apps/bioschemas-api/src/main/resources/application.properties index 43ac7dad..5e88397b 100644 --- a/apps/bioschemas-api/src/main/resources/application.properties +++ b/apps/bioschemas-api/src/main/resources/application.properties @@ -1,9 +1,9 @@ -server.servlet.context-path=/bioschemas +server.servlet.context-path=/bioschemas-api server.port=8281 spring.profiles.active=garr -logging.file.name = /var/log/bioschemas/log/bioschemas-api.log +logging.file.name = /var/log/bioschemas-api/bioschemas.log maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml @@ -17,9 +17,8 @@ management.endpoints.web.path-mapping.prometheus = metrics management.endpoints.web.path-mapping.health = health waitTime=5 -outputFolder=/data +outputFolder=/data/bioschemas-harvest outputDataPattern=_base64_gzipped_nquads.txt numberOfPagesToCrawlInALoop=8 totalNumberOfPagesToCrawlInASession=32 -chromiumDriverLocation = /usr/local/bin/chromedriver scrapeVersion=1 \ No newline at end of file