Implemented synchronous scraping call handling

This commit is contained in:
Enrico Ottonello 2022-06-07 15:12:46 +02:00
parent 8a18fe11ec
commit 852ff05881
5 changed files with 174 additions and 11 deletions

View File

@ -0,0 +1,8 @@
https://mobidb.org/sitemap2.xml.gz
scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
https://proteinensemble.org/sitemap2.xml.gz
scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
https://disprot.org/sitemap2.xml.gz
scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz

View File

@ -1,17 +1,20 @@
package eu.dnetlib.bioschemas.api.publisher; package eu.dnetlib.bioschemas.api.publisher;
import eu.dnetlib.bioschemas.api.ServiceScrapeDriver; import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
import eu.dnetlib.bioschemas.api.scraper.ScrapingExecution;
import eu.dnetlib.bioschemas.api.scraper.ScrapingExecutor;
import eu.dnetlib.common.controller.AbstractDnetController; import eu.dnetlib.common.controller.AbstractDnetController;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator; import org.apache.commons.io.LineIterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.*;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse; import javax.servlet.http.HttpServletResponse;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -33,16 +36,20 @@ public class BioschemasAPIController extends AbstractDnetController {
private static Logger logger = LoggerFactory.getLogger(BioschemasAPIController.class); private static Logger logger = LoggerFactory.getLogger(BioschemasAPIController.class);
@RequestMapping(value = "/startScraping", method = RequestMethod.GET) @Autowired
public String startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BioschemasException, IOException { private ScrapingExecutor scrapingExecutor;
private static final Log log = LogFactory.getLog(BioschemasAPIController.class);
@GetMapping("/startScraping")
public ScrapingExecution startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl, final HttpServletRequest req) {
logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl); logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr());
}
String sitemapUrlKey = "loc"; @GetMapping("/startScraping/status")
String outputFilename = datasourceKey.concat(getOutputDataPattern()); public final ScrapingExecution statusScraping() {
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename); return scrapingExecutor.getLastScrapingExecution();
service.start();
return "started";
} }
@RequestMapping(value = "/getNQuads", method = RequestMethod.GET) @RequestMapping(value = "/getNQuads", method = RequestMethod.GET)

View File

@ -0,0 +1,99 @@
package eu.dnetlib.bioschemas.api.scraper;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Date;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
/**
 * Mutable description of one scraping run: its identifier, start/end timestamps
 * (epoch milliseconds), current {@link ScrapingStatus} and a human-readable message.
 *
 * <p>The lifecycle helpers {@link #startNew(String)}, {@link #complete()} and
 * {@link #fail(Throwable)} move the object through RUNNING, SUCCESS and FAILED.
 *
 * <p>Thread-safety: every field is {@code volatile}, so a value written by one thread
 * is visible to readers on other threads. A reader may still observe a mix of old and
 * new fields while a lifecycle helper is in progress; callers that need a consistent
 * multi-field view must synchronize externally.
 */
public class ScrapingExecution {

    private static final Log log = LogFactory.getLog(ScrapingExecution.class);

    private volatile String id;
    private volatile Long dateStart;
    private volatile Long dateEnd;
    private volatile ScrapingStatus status = ScrapingStatus.NOT_YET_STARTED;
    private volatile String message;

    /** Creates an execution in the {@link ScrapingStatus#NOT_YET_STARTED} state with all other fields unset. */
    public ScrapingExecution() {}

    /**
     * Creates an execution with every field supplied explicitly.
     *
     * @param id        identifier of the run, may be {@code null}
     * @param dateStart start time in epoch milliseconds, may be {@code null}
     * @param dateEnd   end time in epoch milliseconds, may be {@code null}
     * @param status    status of the run
     * @param message   human-readable description, may be {@code null}
     */
    public ScrapingExecution(final String id, final Long dateStart, final Long dateEnd, final ScrapingStatus status, final String message) {
        this.id = id;
        this.dateStart = dateStart;
        this.dateEnd = dateEnd;
        this.status = status;
        this.message = message;
    }

    public String getId() {
        return id;
    }

    public void setId(final String id) {
        this.id = id;
    }

    public Long getDateStart() {
        return dateStart;
    }

    public void setDateStart(final Long dateStart) {
        this.dateStart = dateStart;
    }

    public Long getDateEnd() {
        return dateEnd;
    }

    public void setDateEnd(final Long dateEnd) {
        this.dateEnd = dateEnd;
    }

    public ScrapingStatus getStatus() {
        return status;
    }

    public void setStatus(final ScrapingStatus status) {
        this.status = status;
    }

    public String getMessage() {
        return message;
    }

    public void setMessage(final String message) {
        this.message = message;
    }

    /**
     * Resets this object for a fresh run: assigns a new random id, records the start
     * time, clears the end time, switches to RUNNING and logs the message.
     *
     * @param message description of the run being started
     */
    public void startNew(final String message) {
        setId("scraping-" + UUID.randomUUID());
        setDateStart(System.currentTimeMillis());
        setDateEnd(null);
        setStatus(ScrapingStatus.RUNNING);
        setMessage(message);
        log.info(message);
    }

    /**
     * Marks the run as successfully finished and stores a message with the elapsed time.
     * If no start time was ever recorded, the elapsed time is reported as zero instead
     * of failing with a {@link NullPointerException}.
     */
    public void complete() {
        final long end = System.currentTimeMillis();
        final Long start = getDateStart();
        setDateEnd(end);
        setStatus(ScrapingStatus.SUCCESS);

        final long millis = (start != null) ? end - start : 0L;
        final long minutes = TimeUnit.MILLISECONDS.toMinutes(millis);
        // Seconds left over after the whole minutes have been removed.
        final long seconds = TimeUnit.MILLISECONDS.toSeconds(millis) - TimeUnit.MINUTES.toSeconds(minutes);
        setMessage(String.format("Scraping completed in %d min, %d sec", minutes, seconds));
        log.info(getMessage());
    }

    /**
     * Marks the run as failed, records the end time and stores a description of the error.
     *
     * @param e the error that terminated the run; when it carries no message the
     *          exception class name is stored so the failure is still identifiable
     */
    public void fail(final Throwable e) {
        setDateEnd(System.currentTimeMillis());
        setStatus(ScrapingStatus.FAILED);
        setMessage(describe(e));
        log.error("Error scraping", e);
    }

    /** Returns the throwable's message, falling back to its class name when the message is absent. */
    private static String describe(final Throwable e) {
        if (e == null) {
            return "Unknown error";
        }
        final String text = e.getMessage();
        return (text == null || text.isEmpty()) ? e.getClass().getName() : text;
    }
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.bioschemas.api.scraper;
import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
import org.springframework.stereotype.Component;
import javax.servlet.http.HttpServletRequest;
/**
 * Launches scraping runs on a background thread and tracks the state of the most
 * recent one. At most one run is allowed at a time.
 *
 * <p>All state transitions of the tracked {@link ScrapingExecution} (start, complete,
 * fail) and the "is a run already in progress" check happen while holding the monitor
 * of that object, so a status written by the worker thread is reliably seen by the
 * next call to {@link #startScraping}.
 */
@Component
public class ScrapingExecutor {

    /** Single shared record, reused for every run; it also serves as the lock object. */
    private final ScrapingExecution lastScrapingExecution = new ScrapingExecution();

    /**
     * @return the live record of the most recent run (NOT_YET_STARTED if none was launched);
     *         the same instance is updated in place as the run progresses
     */
    public ScrapingExecution getLastScrapingExecution() {
        return lastScrapingExecution;
    }

    /**
     * Starts a scraping run in a new thread unless one is already running.
     *
     * @param datasourceKey     key of the datasource; also the prefix of the output file name
     * @param sitemapUrl        URL of the sitemap to scrape
     * @param outputDataPattern suffix appended to {@code datasourceKey} to build the output file name
     * @param remoteAddr        address of the requester, recorded in the run message
     * @return the live record of the run that was started, or a separate NOT_LAUNCHED
     *         record when another run is still in progress
     */
    public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr) {
        synchronized (lastScrapingExecution) {
            if (lastScrapingExecution.getStatus() == ScrapingStatus.RUNNING) {
                final long now = System.currentTimeMillis();
                return new ScrapingExecution(null, now, now, ScrapingStatus.NOT_LAUNCHED, "An other scraping is running");
            }

            lastScrapingExecution.startNew("Scraping for " + datasourceKey + " " + sitemapUrl + " - request from " + remoteAddr);

            // A named thread makes the worker identifiable in thread dumps and logs.
            final Thread worker = new Thread(
                    () -> runScraping(datasourceKey, sitemapUrl, outputDataPattern),
                    "scraping-" + datasourceKey);
            worker.start();
        }
        return lastScrapingExecution;
    }

    /**
     * Body of the worker thread: runs the scrape driver and records the outcome.
     * NOTE(review): completion is recorded as soon as {@code service.start()} returns,
     * so this presumes that call blocks until scraping is finished; confirm against
     * ServiceScrapeDriver.
     */
    private void runScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern) {
        try {
            final String sitemapUrlKey = "loc";
            final String outputFilename = datasourceKey.concat(outputDataPattern);
            final ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
            service.start();
            // Same monitor as the RUNNING check in startScraping, so the new status is visible there.
            synchronized (lastScrapingExecution) {
                lastScrapingExecution.complete();
            }
        } catch (final Throwable e) {
            // Top of the worker thread: anything escaping here would be lost, so record every failure.
            synchronized (lastScrapingExecution) {
                lastScrapingExecution.fail(e);
            }
        }
    }
}

View File

@ -0,0 +1,9 @@
package eu.dnetlib.bioschemas.api.scraper;
/**
 * Lifecycle states of a scraping run as tracked by {@code ScrapingExecution}.
 */
public enum ScrapingStatus {
/** The run finished without error (set by {@code ScrapingExecution.complete()}). */
SUCCESS,
/** The run terminated with an error (set by {@code ScrapingExecution.fail(Throwable)}). */
FAILED,
/** A run is currently in progress (set by {@code ScrapingExecution.startNew(String)}). */
RUNNING,
/** A start request was rejected because another run was still in progress. */
NOT_LAUNCHED,
/** Initial state of a {@code ScrapingExecution} before any run has been started. */
NOT_YET_STARTED
}