Compare commits

...

2 Commits

4 changed files with 18 additions and 20 deletions

View File

@@ -44,19 +44,12 @@ public class ServiceScrapeDriver {
     private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);
-    public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
+    public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename, String outputFolder) {
         this.sitemapUrl = sitemapUrl;
         this.sitemapURLKey = sitemapURLKey;
         this.maxScrapedPages = maxScrapedPages;
         this.outputFilename = outputFilename;
-    }
-    /**
-     * Runs the scrape process
-     *
-     */
-    public void start() throws IOException {
-        runScrape();
+        this.outputFolder = outputFolder;
     }
     /**
@@ -66,7 +59,7 @@ public class ServiceScrapeDriver {
      * as been left in situ in case it is useful in the future.
      *
      */
-    private void runScrape() throws IOException {
+    public void runScrape() throws IOException {
         processProperties();
         String url = sitemapUrl.toLowerCase();
         Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());
@@ -189,8 +182,6 @@ public class ServiceScrapeDriver {
         waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
         logger.info(" waitTime: " + waitTime);
-        outputFolder = prop.getProperty("outputFolder").trim();
-        logger.info(" outputFolder: " + outputFolder);
         numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
         logger.info(" numberOfPagesToCrawl: " + numberOfPagesToCrawlInALoop);
         totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
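
After this change the driver no longer reads outputFolder inside processProperties(); the caller supplies it through the constructor and invokes the now-public runScrape() directly, since the start() wrapper has been removed. A minimal caller sketch under those assumptions (the sitemap URL and output filename below are placeholders, not values from this change):

    ServiceScrapeDriver driver = new ServiceScrapeDriver(
            "https://example.org/sitemap.xml",        // sitemapUrl (placeholder)
            "loc",                                    // sitemapURLKey, as used by ScrapingExecutor
            null,                                     // maxScrapedPages, as passed by ScrapingExecutor
            "mydatasource_base64_gzipped_nquads.txt", // outputFilename (placeholder)
            "/data/bioschemas-harvest");              // outputFolder, now passed in by the caller
    driver.runScrape();                               // replaces the removed service.start()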

View File

@@ -44,7 +44,7 @@ public class BioschemasAPIController extends AbstractDnetController {
     @GetMapping("/startScraping")
     public ScrapingExecution startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl, final HttpServletRequest req) {
         logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
-        return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr());
+        return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr(), getOutputFolder());
     }
     @GetMapping("/startScraping/status")
@@ -76,4 +76,12 @@ public class BioschemasAPIController extends AbstractDnetController {
     public String getOutputDataPattern() {
         return outputDataPattern;
     }
+    public void setOutputFolder(String outputFolder) {
+        this.outputFolder = outputFolder;
+    }
+    public void setOutputDataPattern(String outputDataPattern) {
+        this.outputDataPattern = outputDataPattern;
+    }
 }
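
The two new setters let outputFolder and outputDataPattern be set on the controller from outside. A minimal usage sketch, e.g. for a unit test or manual wiring (the values are taken from application.properties in this change set; the wiring itself is an assumption, not part of the diff):

    // Hypothetical wiring sketch: how the new setters could be exercised.
    BioschemasAPIController controller = new BioschemasAPIController();
    controller.setOutputFolder("/data/bioschemas-harvest");
    controller.setOutputDataPattern("_base64_gzipped_nquads.txt");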

View File

@@ -14,7 +14,7 @@ public class ScrapingExecutor {
         return lastScrapingExecution;
     }
-    public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr) {
+    public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr, final String outputFolder) {
         synchronized (lastScrapingExecution) {
             if (lastScrapingExecution.getStatus() != ScrapingStatus.RUNNING) {
                 lastScrapingExecution.startNew("Scraping for " + datasourceKey + " " + sitemapUrl + " - request from " + remoteAddr);
@@ -22,8 +22,8 @@ public class ScrapingExecutor {
                 try {
                     String sitemapUrlKey = "loc";
                     String outputFilename = datasourceKey.concat(outputDataPattern);
-                    ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
-                    service.start();
+                    ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename, outputFolder);
+                    service.runScrape();
                     lastScrapingExecution.complete();
                 } catch (final Throwable e) {
                     lastScrapingExecution.fail(e);

View File

@@ -1,9 +1,9 @@
-server.servlet.context-path=/bioschemas
+server.servlet.context-path=/bioschemas-api
 server.port=8281
 spring.profiles.active=garr
-logging.file.name = /var/log/bioschemas/log/bioschemas-api.log
+logging.file.name = /var/log/bioschemas-api/bioschemas.log
 maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
@@ -17,9 +17,8 @@ management.endpoints.web.path-mapping.prometheus = metrics
 management.endpoints.web.path-mapping.health = health
 waitTime=5
-outputFolder=/data
+outputFolder=/data/bioschemas-harvest
 outputDataPattern=_base64_gzipped_nquads.txt
 numberOfPagesToCrawlInALoop=8
 totalNumberOfPagesToCrawlInASession=32
-chromiumDriverLocation = /usr/local/bin/chromedriver
 scrapeVersion=1
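
Because the servlet context path changes from /bioschemas to /bioschemas-api, callers must switch to the new base URL. A sketch of invoking the scraping endpoint against the renamed path (host and query values are placeholders; only the port, context path and parameter names come from this change set):

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    public class StartScrapingCall {
        public static void main(String[] args) throws Exception {
            // Hypothetical client call against the renamed context path.
            HttpClient client = HttpClient.newHttpClient();
            HttpRequest request = HttpRequest.newBuilder()
                    .uri(URI.create("http://localhost:8281/bioschemas-api/startScraping"
                            + "?datasourceKey=mydatasource"
                            + "&sitemapUrl=https%3A%2F%2Fexample.org%2Fsitemap.xml"))
                    .GET()
                    .build();
            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
            System.out.println(response.body()); // the ScrapingExecution returned by the controller
        }
    }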