implemented synchronous scraping call handling
This commit is contained in:
parent
8a18fe11ec
commit
852ff05881
@@ -0,0 +1,8 @@
https://mobidb.org/sitemap2.xml.gz
scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz

https://proteinensemble.org/sitemap2.xml.gz
scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz

https://disprot.org/sitemap2.xml.gz
scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
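The entries above pair each sitemap URL with the URL-encoded scrape request it maps to. A minimal client sketch of building that encoding and calling the endpoint, assuming a local deployment on port 8080 (host, port, and the java.net.http client are illustrative, not part of this commit):

	import java.net.URI;
	import java.net.URLEncoder;
	import java.net.http.HttpClient;
	import java.net.http.HttpRequest;
	import java.net.http.HttpResponse;
	import java.nio.charset.StandardCharsets;

	public class ScrapeClientSketch {
		public static void main(String[] args) throws Exception {
			String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz";
			// URL-encoding the sitemap URL yields the %3A%2F%2F form listed above.
			String query = "datasourceKey=mobidb&sitemapUrl="
					+ URLEncoder.encode(sitemapUrl, StandardCharsets.UTF_8);
			HttpRequest request = HttpRequest.newBuilder(
					URI.create("http://localhost:8080/startScraping?" + query)).GET().build();
			HttpResponse<String> response = HttpClient.newHttpClient()
					.send(request, HttpResponse.BodyHandlers.ofString());
			System.out.println(response.body()); // serialized ScrapingExecution
		}
	}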
@@ -1,17 +1,20 @@
 package eu.dnetlib.bioschemas.api.publisher;
 
 import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
+import eu.dnetlib.bioschemas.api.scraper.ScrapingExecution;
+import eu.dnetlib.bioschemas.api.scraper.ScrapingExecutor;
 import eu.dnetlib.common.controller.AbstractDnetController;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.LineIterator;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
-import org.springframework.web.bind.annotation.RequestMapping;
-import org.springframework.web.bind.annotation.RequestMethod;
-import org.springframework.web.bind.annotation.RequestParam;
-import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.bind.annotation.*;
 
+import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
 import java.io.File;
 import java.io.IOException;
@@ -33,16 +36,20 @@ public class BioschemasAPIController extends AbstractDnetController {
 
 	private static Logger logger = LoggerFactory.getLogger(BioschemasAPIController.class);
 
-	@RequestMapping(value = "/startScraping", method = RequestMethod.GET)
-	public String startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BioschemasException, IOException {
+	@Autowired
+	private ScrapingExecutor scrapingExecutor;
+
+	private static final Log log = LogFactory.getLog(BioschemasAPIController.class);
+
+	@GetMapping("/startScraping")
+	public ScrapingExecution startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl, final HttpServletRequest req) {
 		logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
-		String sitemapUrlKey = "loc";
-		String outputFilename = datasourceKey.concat(getOutputDataPattern());
-		ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
-		service.start();
-		return "started";
+		return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr());
+	}
+
+	@GetMapping("/startScraping/status")
+	public final ScrapingExecution statusScraping() {
+		return scrapingExecutor.getLastScrapingExecution();
 	}
 
 	@RequestMapping(value = "/getNQuads", method = RequestMethod.GET)
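With this change the synchronized guard in ScrapingExecutor serializes executions while the HTTP request itself returns immediately: startScraping hands the work to the executor and responds with the current ScrapingExecution, and the new /startScraping/status endpoint exposes the last execution for polling. A polling sketch, assuming the same local deployment and a crude string check in place of real JSON parsing:

	import java.net.URI;
	import java.net.http.HttpClient;
	import java.net.http.HttpRequest;
	import java.net.http.HttpResponse;

	public class ScrapingStatusPoller {
		public static void main(String[] args) throws Exception {
			HttpClient client = HttpClient.newHttpClient();
			HttpRequest status = HttpRequest.newBuilder(
					URI.create("http://localhost:8080/startScraping/status")).GET().build();
			String body;
			do {
				Thread.sleep(5_000); // poll every five seconds
				body = client.send(status, HttpResponse.BodyHandlers.ofString()).body();
			} while (body.contains("RUNNING")); // a real client would parse the JSON status field
			System.out.println("Last execution: " + body);
		}
	}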
@@ -0,0 +1,99 @@
package eu.dnetlib.bioschemas.api.scraper;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.util.Date;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

public class ScrapingExecution {

	private String id;
	private Long dateStart;
	private Long dateEnd;
	private ScrapingStatus status = ScrapingStatus.NOT_YET_STARTED;
	private String message;

	private static final Log log = LogFactory.getLog(ScrapingExecution.class);

	public ScrapingExecution() {}

	public ScrapingExecution(final String id, final Long dateStart, final Long dateEnd, final ScrapingStatus status, final String message) {
		this.id = id;
		this.dateStart = dateStart;
		this.dateEnd = dateEnd;
		this.status = status;
		this.message = message;
	}

	public String getId() {
		return id;
	}

	public void setId(final String id) {
		this.id = id;
	}

	public Long getDateStart() {
		return dateStart;
	}

	public void setDateStart(final Long dateStart) {
		this.dateStart = dateStart;
	}

	public Long getDateEnd() {
		return dateEnd;
	}

	public void setDateEnd(final Long dateEnd) {
		this.dateEnd = dateEnd;
	}

	public ScrapingStatus getStatus() {
		return status;
	}

	public void setStatus(final ScrapingStatus status) {
		this.status = status;
	}

	public String getMessage() {
		return message;
	}

	public void setMessage(final String message) {
		this.message = message;
	}

	public void startNew(final String message) {
		setId("scraping-" + UUID.randomUUID());
		setDateStart(System.currentTimeMillis());
		setDateEnd(null);
		setStatus(ScrapingStatus.RUNNING);
		setMessage(message);
		log.info(message);
	}

	public void complete() {
		setDateEnd(System.currentTimeMillis());
		setStatus(ScrapingStatus.SUCCESS);

		final long millis = getDateEnd() - getDateStart();
		setMessage(String
			.format("Scraping completed in %d min, %d sec", TimeUnit.MILLISECONDS.toMinutes(millis), TimeUnit.MILLISECONDS.toSeconds(millis) -
				TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))));

		log.info(getMessage());
	}

	public void fail(final Throwable e) {
		setDateEnd(new Date().getTime());
		setStatus(ScrapingStatus.FAILED);
		setMessage(e.getMessage());
		log.error("Error scraping", e);
	}

}
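The message built in complete() splits the elapsed milliseconds into whole minutes plus the leftover seconds. A standalone check of that arithmetic, with an illustrative duration:

	import java.util.concurrent.TimeUnit;

	public class DurationFormatCheck {
		public static void main(String[] args) {
			final long millis = 135_000L; // e.g. a run of 2 min 15 sec
			final long minutes = TimeUnit.MILLISECONDS.toMinutes(millis);    // 2
			final long seconds = TimeUnit.MILLISECONDS.toSeconds(millis)
					- TimeUnit.MINUTES.toSeconds(minutes);                   // 135 - 120 = 15
			System.out.println(String.format("Scraping completed in %d min, %d sec", minutes, seconds));
		}
	}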
@@ -0,0 +1,40 @@
package eu.dnetlib.bioschemas.api.scraper;

import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
import org.springframework.stereotype.Component;

import javax.servlet.http.HttpServletRequest;

@Component
public class ScrapingExecutor {

	private final ScrapingExecution lastScrapingExecution = new ScrapingExecution();

	public ScrapingExecution getLastScrapingExecution() {
		return lastScrapingExecution;
	}

	public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr) {
		synchronized (lastScrapingExecution) {
			if (lastScrapingExecution.getStatus() != ScrapingStatus.RUNNING) {
				lastScrapingExecution.startNew("Scraping for " + datasourceKey + " " + sitemapUrl + " - request from " + remoteAddr);
				new Thread(() -> {
					try {
						String sitemapUrlKey = "loc";
						String outputFilename = datasourceKey.concat(outputDataPattern);
						ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
						service.start();
						lastScrapingExecution.complete();
					} catch (final Throwable e) {
						lastScrapingExecution.fail(e);
					}
				}).start();
			} else {
				final long now = System.currentTimeMillis();
				return new ScrapingExecution(null, now, now, ScrapingStatus.NOT_LAUNCHED, "Another scraping is running");
			}
		}
		return lastScrapingExecution;
	}
}
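The synchronized block makes the executor single-flight: the status check and startNew happen atomically, so concurrent requests cannot both launch a scrape, and a caller that arrives while one is RUNNING gets a NOT_LAUNCHED result instead. A usage sketch, assuming the first scrape is still running when the second call arrives and that ".nq" stands in for the real output pattern:

	import eu.dnetlib.bioschemas.api.scraper.ScrapingExecution;
	import eu.dnetlib.bioschemas.api.scraper.ScrapingExecutor;

	public class SingleFlightDemo {
		public static void main(String[] args) {
			ScrapingExecutor executor = new ScrapingExecutor();
			ScrapingExecution first = executor.startScraping("mobidb",
					"https://mobidb.org/sitemap2.xml.gz", ".nq", "127.0.0.1");
			System.out.println(first.getStatus());  // RUNNING (work continues on a background thread)
			ScrapingExecution second = executor.startScraping("disprot",
					"https://disprot.org/sitemap2.xml.gz", ".nq", "127.0.0.1");
			System.out.println(second.getStatus()); // NOT_LAUNCHED while the first is in flight
		}
	}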
@@ -0,0 +1,9 @@
package eu.dnetlib.bioschemas.api.scraper;

public enum ScrapingStatus {
	SUCCESS,
	FAILED,
	RUNNING,
	NOT_LAUNCHED,
	NOT_YET_STARTED
}