Compare commits

2 Commits: 7511ef94c2 ... 3f61e6fce2

| Author | SHA1 | Date |
|---|---|---|
| Enrico Ottonello | 3f61e6fce2 | |
| Enrico Ottonello | a70a327281 | |
ServiceScrapeDriver.java

```diff
@@ -44,19 +44,12 @@ public class ServiceScrapeDriver {
 
 	private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);
 
-	public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
+	public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename, String outputFolder) {
 		this.sitemapUrl = sitemapUrl;
 		this.sitemapURLKey = sitemapURLKey;
 		this.maxScrapedPages = maxScrapedPages;
 		this.outputFilename = outputFilename;
-	}
-
-	/**
-	 * Runs the scrape process
-	 *
-	 */
-	public void start() throws IOException {
-		runScrape();
+		this.outputFolder = outputFolder;
 	}
 
 	/**
@@ -66,7 +59,7 @@ public class ServiceScrapeDriver {
 	 * as been left in situ in case it is useful in the future.
 	 *
 	 */
-	private void runScrape() throws IOException {
+	public void runScrape() throws IOException {
 		processProperties();
 		String url = sitemapUrl.toLowerCase();
 		Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());
@@ -189,8 +182,6 @@ public class ServiceScrapeDriver {
 
 		waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
 		logger.info(" waitTime: " + waitTime);
-		outputFolder = prop.getProperty("outputFolder").trim();
-		logger.info(" outputFolder: " + outputFolder);
 		numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
 		logger.info(" numberOfPagesToCrawl: " + numberOfPagesToCrawlInALoop);
 		totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
```
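Taken together, these hunks change how `ServiceScrapeDriver` receives its output folder: instead of reading `outputFolder` from the properties file inside `processProperties()`, the caller now passes it to the constructor, and callers invoke the now-public `runScrape()` directly since the `start()` wrapper was removed. A minimal sketch of the new call pattern, assuming the repo classes are on the classpath; all argument values here are illustrative, not from this repo:

```java
import java.io.IOException;

public class DriverUsageSketch {
    public static void main(String[] args) throws IOException {
        ServiceScrapeDriver driver = new ServiceScrapeDriver(
                "https://example.org/sitemap.xml", // sitemapUrl (hypothetical)
                "loc",                             // sitemapURLKey, as used by ScrapingExecutor below
                null,                              // maxScrapedPages, as passed by ScrapingExecutor
                "ped_base64_gzipped_nquads.txt",   // outputFilename (hypothetical key + pattern)
                "/data/bioschemas-harvest");       // outputFolder, now a constructor argument
        driver.runScrape(); // replaces the removed start() wrapper
    }
}
```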
BioschemasAPIController.java

```diff
@@ -44,7 +44,7 @@ public class BioschemasAPIController extends AbstractDnetController {
 	@GetMapping("/startScraping")
 	public ScrapingExecution startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl, final HttpServletRequest req) {
 		logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
-		return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr());
+		return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr(), getOutputFolder());
 	}
 
 	@GetMapping("/startScraping/status")
@@ -76,4 +76,12 @@ public class BioschemasAPIController extends AbstractDnetController {
 	public String getOutputDataPattern() {
 		return outputDataPattern;
 	}
+
+	public void setOutputFolder(String outputFolder) {
+		this.outputFolder = outputFolder;
+	}
+
+	public void setOutputDataPattern(String outputDataPattern) {
+		this.outputDataPattern = outputDataPattern;
+	}
 }
```
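The controller now threads the configured output folder through to the executor and gains plain setters alongside the existing getters. The diff does not show how the setters are called; a hedged sketch of one plausible wiring, using the `outputFolder` and `outputDataPattern` values from the properties file below:

```java
public class ControllerWiringSketch {
    public static void main(String[] args) {
        // Hypothetical manual wiring; the repo's actual injection mechanism is not shown in this diff.
        BioschemasAPIController controller = new BioschemasAPIController();
        controller.setOutputFolder("/data/bioschemas-harvest");        // outputFolder key
        controller.setOutputDataPattern("_base64_gzipped_nquads.txt"); // outputDataPattern key
    }
}
```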
ScrapingExecutor.java

```diff
@@ -14,7 +14,7 @@ public class ScrapingExecutor {
 		return lastScrapingExecution;
 	}
 
-	public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr) {
+	public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr, final String outputFolder) {
 		synchronized (lastScrapingExecution) {
 			if (lastScrapingExecution.getStatus() != ScrapingStatus.RUNNING) {
 				lastScrapingExecution.startNew("Scraping for " + datasourceKey + " " + sitemapUrl + " - request from " + remoteAddr);
@@ -22,8 +22,8 @@ public class ScrapingExecutor {
 				try {
 					String sitemapUrlKey = "loc";
 					String outputFilename = datasourceKey.concat(outputDataPattern);
-					ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
-					service.start();
+					ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename, outputFolder);
+					service.runScrape();
 					lastScrapingExecution.complete();
 				} catch (final Throwable e) {
 					lastScrapingExecution.fail(e);
```
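The executor's guard logic is unchanged: the `synchronized` block on `lastScrapingExecution` plus the `ScrapingStatus.RUNNING` check still ensures only one scrape runs at a time; these commits only extend the signature and switch the call to `runScrape()`. For reference, the output filename is simply the datasource key concatenated with the configured pattern, shown here with an illustrative key:

```java
public class FilenameSketch {
    public static void main(String[] args) {
        String datasourceKey = "ped";                            // hypothetical request parameter
        String outputDataPattern = "_base64_gzipped_nquads.txt"; // value from the properties file below
        String outputFilename = datasourceKey.concat(outputDataPattern);
        System.out.println(outputFilename); // ped_base64_gzipped_nquads.txt, written under outputFolder
    }
}
```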
application.properties

```diff
@@ -1,9 +1,9 @@
-server.servlet.context-path=/bioschemas
+server.servlet.context-path=/bioschemas-api
 server.port=8281
 
 spring.profiles.active=garr
 
-logging.file.name = /var/log/bioschemas/log/bioschemas-api.log
+logging.file.name = /var/log/bioschemas-api/bioschemas.log
 
 maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
 
@@ -17,9 +17,8 @@ management.endpoints.web.path-mapping.prometheus = metrics
 management.endpoints.web.path-mapping.health = health
 
 waitTime=5
-outputFolder=/data
+outputFolder=/data/bioschemas-harvest
 outputDataPattern=_base64_gzipped_nquads.txt
 numberOfPagesToCrawlInALoop=8
 totalNumberOfPagesToCrawlInASession=32
-chromiumDriverLocation = /usr/local/bin/chromedriver
 scrapeVersion=1
```
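With `outputFolder` now injected through the constructor, `processProperties()` no longer reads it (first file above), while the properties file keeps the key for the controller side, points it at `/data/bioschemas-harvest`, and drops `chromiumDriverLocation` entirely. A hedged sketch of reading the remaining crawl keys in the same parsing style as the `ServiceScrapeDriver` hunk; the file path and loading mechanism here are illustrative, as the diff does not show how the service actually loads its properties:

```java
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Properties;

public class PropertiesSketch {
    public static void main(String[] args) throws Exception {
        Properties prop = new Properties();
        try (InputStream in = Files.newInputStream(Paths.get("application.properties"))) { // illustrative path
            prop.load(in);
        }
        // Mirrors the trim-then-parse style visible in processProperties().
        int waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
        int pagesPerLoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
        int pagesPerSession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
        System.out.println(waitTime + " " + pagesPerLoop + " " + pagesPerSession);
        // outputFolder is no longer read here; it now arrives via the constructor.
    }
}
```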