implemented synchronous scraping call handling
This commit is contained in:
parent
8a18fe11ec
commit
852ff05881
|
@ -0,0 +1,8 @@
|
||||||
|
https://mobidb.org/sitemap2.xml.gz
|
||||||
|
scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
|
||||||
|
|
||||||
|
https://proteinensemble.org/sitemap2.xml.gz
|
||||||
|
scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
|
||||||
|
|
||||||
|
https://disprot.org/sitemap2.xml.gz
|
||||||
|
scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
|
|
@ -1,17 +1,20 @@
|
||||||
package eu.dnetlib.bioschemas.api.publisher;
|
package eu.dnetlib.bioschemas.api.publisher;
|
||||||
|
|
||||||
import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
|
import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
|
||||||
|
import eu.dnetlib.bioschemas.api.scraper.ScrapingExecution;
|
||||||
|
import eu.dnetlib.bioschemas.api.scraper.ScrapingExecutor;
|
||||||
import eu.dnetlib.common.controller.AbstractDnetController;
|
import eu.dnetlib.common.controller.AbstractDnetController;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.LineIterator;
|
import org.apache.commons.io.LineIterator;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
import org.springframework.web.bind.annotation.RequestMapping;
|
import org.springframework.web.bind.annotation.*;
|
||||||
import org.springframework.web.bind.annotation.RequestMethod;
|
|
||||||
import org.springframework.web.bind.annotation.RequestParam;
|
|
||||||
import org.springframework.web.bind.annotation.RestController;
|
|
||||||
|
|
||||||
|
import javax.servlet.http.HttpServletRequest;
|
||||||
import javax.servlet.http.HttpServletResponse;
|
import javax.servlet.http.HttpServletResponse;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -33,16 +36,20 @@ public class BioschemasAPIController extends AbstractDnetController {
|
||||||
|
|
||||||
private static Logger logger = LoggerFactory.getLogger(BioschemasAPIController.class);
|
private static Logger logger = LoggerFactory.getLogger(BioschemasAPIController.class);
|
||||||
|
|
||||||
@RequestMapping(value = "/startScraping", method = RequestMethod.GET)
|
@Autowired
|
||||||
public String startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BioschemasException, IOException {
|
private ScrapingExecutor scrapingExecutor;
|
||||||
|
|
||||||
|
private static final Log log = LogFactory.getLog(BioschemasAPIController.class);
|
||||||
|
|
||||||
|
@GetMapping("/startScraping")
|
||||||
|
public ScrapingExecution startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl, final HttpServletRequest req) {
|
||||||
logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
|
logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
|
||||||
|
return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr());
|
||||||
|
}
|
||||||
|
|
||||||
String sitemapUrlKey = "loc";
|
@GetMapping("/startScraping/status")
|
||||||
String outputFilename = datasourceKey.concat(getOutputDataPattern());
|
public final ScrapingExecution statusScraping() {
|
||||||
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
|
return scrapingExecutor.getLastScrapingExecution();
|
||||||
service.start();
|
|
||||||
return "started";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@RequestMapping(value = "/getNQuads", method = RequestMethod.GET)
|
@RequestMapping(value = "/getNQuads", method = RequestMethod.GET)
|
||||||
|
|
|
@ -0,0 +1,99 @@
|
||||||
|
package eu.dnetlib.bioschemas.api.scraper;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.UUID;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
public class ScrapingExecution {
|
||||||
|
|
||||||
|
private String id;
|
||||||
|
private Long dateStart;
|
||||||
|
private Long dateEnd;
|
||||||
|
private ScrapingStatus status = ScrapingStatus.NOT_YET_STARTED;
|
||||||
|
private String message;
|
||||||
|
|
||||||
|
private static final Log log = LogFactory.getLog(ScrapingExecution.class);
|
||||||
|
|
||||||
|
public ScrapingExecution() {}
|
||||||
|
|
||||||
|
public ScrapingExecution(final String id, final Long dateStart, final Long dateEnd, final ScrapingStatus status, final String message) {
|
||||||
|
this.id = id;
|
||||||
|
this.dateStart = dateStart;
|
||||||
|
this.dateEnd = dateEnd;
|
||||||
|
this.status = status;
|
||||||
|
this.message = message;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getId() {
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setId(final String id) {
|
||||||
|
this.id = id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Long getDateStart() {
|
||||||
|
return dateStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDateStart(final Long dateStart) {
|
||||||
|
this.dateStart = dateStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Long getDateEnd() {
|
||||||
|
return dateEnd;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDateEnd(final Long dateEnd) {
|
||||||
|
this.dateEnd = dateEnd;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ScrapingStatus getStatus() {
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setStatus(final ScrapingStatus status) {
|
||||||
|
this.status = status;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getMessage() {
|
||||||
|
return message;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMessage(final String message) {
|
||||||
|
this.message = message;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void startNew(final String message) {
|
||||||
|
setId("scraping-" + UUID.randomUUID());
|
||||||
|
setDateStart(System.currentTimeMillis());
|
||||||
|
setDateEnd(null);
|
||||||
|
setStatus(ScrapingStatus.RUNNING);
|
||||||
|
setMessage(message);
|
||||||
|
log.info(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void complete() {
|
||||||
|
setDateEnd(System.currentTimeMillis());
|
||||||
|
setStatus(ScrapingStatus.SUCCESS);
|
||||||
|
|
||||||
|
final long millis = getDateEnd() - getDateStart();
|
||||||
|
setMessage(String
|
||||||
|
.format("Scraping completed in %d min, %d sec", TimeUnit.MILLISECONDS.toMinutes(millis), TimeUnit.MILLISECONDS.toSeconds(millis) -
|
||||||
|
TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))));
|
||||||
|
|
||||||
|
log.info(getMessage());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void fail(final Throwable e) {
|
||||||
|
setDateEnd(new Date().getTime());
|
||||||
|
setStatus(ScrapingStatus.FAILED);
|
||||||
|
setMessage(e.getMessage());
|
||||||
|
log.error("Error scraping", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
package eu.dnetlib.bioschemas.api.scraper;
|
||||||
|
|
||||||
|
import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import javax.servlet.http.HttpServletRequest;
|
||||||
|
|
||||||
|
@Component
|
||||||
|
public class ScrapingExecutor {
|
||||||
|
|
||||||
|
private final ScrapingExecution lastScrapingExecution = new ScrapingExecution();
|
||||||
|
|
||||||
|
public ScrapingExecution getLastScrapingExecution() {
|
||||||
|
return lastScrapingExecution;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr) {
|
||||||
|
synchronized (lastScrapingExecution) {
|
||||||
|
if (lastScrapingExecution.getStatus() != ScrapingStatus.RUNNING) {
|
||||||
|
lastScrapingExecution.startNew("Scraping for " + datasourceKey + " " + sitemapUrl + " - request from " + remoteAddr);
|
||||||
|
new Thread(() -> {
|
||||||
|
try {
|
||||||
|
String sitemapUrlKey = "loc";
|
||||||
|
String outputFilename = datasourceKey.concat(outputDataPattern);
|
||||||
|
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
|
||||||
|
service.start();
|
||||||
|
lastScrapingExecution.complete();
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
lastScrapingExecution.fail(e);
|
||||||
|
}
|
||||||
|
}).start();
|
||||||
|
} else {
|
||||||
|
final long now = System.currentTimeMillis();
|
||||||
|
return new ScrapingExecution(null, now, now, ScrapingStatus.NOT_LAUNCHED, "An other scraping is running");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return lastScrapingExecution;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,9 @@
|
||||||
|
package eu.dnetlib.bioschemas.api.scraper;
|
||||||
|
|
||||||
|
/**
 * Lifecycle states of a scraping run, as tracked by {@code ScrapingExecution}.
 */
public enum ScrapingStatus {

	SUCCESS,         // the last run finished normally
	FAILED,          // the last run terminated with an error
	RUNNING,         // a run is currently in progress
	NOT_LAUNCHED,    // a launch request was refused because another run was in progress
	NOT_YET_STARTED  // initial state: no run has been requested yet

}
|
Loading…
Reference in New Issue