diff --git a/apps/bioschemas-api/request_example.txt b/apps/bioschemas-api/request_example.txt
new file mode 100644
index 00000000..8aa605f1
--- /dev/null
+++ b/apps/bioschemas-api/request_example.txt
@@ -0,0 +1,8 @@
+https://mobidb.org/sitemap2.xml.gz
+scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
+
+https://proteinensemble.org/sitemap2.xml.gz
+scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
+
+https://disprot.org/sitemap2.xml.gz
+scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/publisher/BioschemasAPIController.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/publisher/BioschemasAPIController.java
index 36fe207d..e94fbf8d 100644
--- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/publisher/BioschemasAPIController.java
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/publisher/BioschemasAPIController.java
@@ -1,17 +1,20 @@
 package eu.dnetlib.bioschemas.api.publisher;
 
 import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
+import eu.dnetlib.bioschemas.api.scraper.ScrapingExecution;
+import eu.dnetlib.bioschemas.api.scraper.ScrapingExecutor;
 import eu.dnetlib.common.controller.AbstractDnetController;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.LineIterator;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
-import org.springframework.web.bind.annotation.RequestMapping;
-import org.springframework.web.bind.annotation.RequestMethod;
-import org.springframework.web.bind.annotation.RequestParam;
-import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.bind.annotation.*;
+import javax.servlet.http.HttpServletRequest;
 
 import javax.servlet.http.HttpServletResponse;
 import java.io.File;
 import java.io.IOException;
@@ -33,16 +36,20 @@ public class BioschemasAPIController extends AbstractDnetController {
 
 	private static Logger logger = LoggerFactory.getLogger(BioschemasAPIController.class);
 
-	@RequestMapping(value = "/startScraping", method = RequestMethod.GET)
-	public String startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BioschemasException, IOException {
+	@Autowired
+	private ScrapingExecutor scrapingExecutor;
+	private static final Log log = LogFactory.getLog(BioschemasAPIController.class);
+
+	@GetMapping("/startScraping")
+	public ScrapingExecution startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl, final HttpServletRequest req) {
 
 		logger.info(" datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
+		return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr());
+	}
 
-		String sitemapUrlKey = "loc";
-		String outputFilename = datasourceKey.concat(getOutputDataPattern());
-		ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
-		service.start();
-		return "started";
+	@GetMapping("/startScraping/status")
+	public final ScrapingExecution statusScraping() {
+		return scrapingExecutor.getLastScrapingExecution();
 	}
 
 	@RequestMapping(value = "/getNQuads", method = RequestMethod.GET)
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecution.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecution.java
new file mode 100644
index 00000000..85e22d0f
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecution.java
@@ -0,0 +1,99 @@
+package eu.dnetlib.bioschemas.api.scraper;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.Date;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+
+public class ScrapingExecution {
+
+	private String id;
+	private Long dateStart;
+	private Long dateEnd;
+	private ScrapingStatus status = ScrapingStatus.NOT_YET_STARTED;
+	private String message;
+
+	private static final Log log = LogFactory.getLog(ScrapingExecution.class);
+
+	public ScrapingExecution() {}
+
+	public ScrapingExecution(final String id, final Long dateStart, final Long dateEnd, final ScrapingStatus status, final String message) {
+		this.id = id;
+		this.dateStart = dateStart;
+		this.dateEnd = dateEnd;
+		this.status = status;
+		this.message = message;
+	}
+
+	public String getId() {
+		return id;
+	}
+
+	public void setId(final String id) {
+		this.id = id;
+	}
+
+	public Long getDateStart() {
+		return dateStart;
+	}
+
+	public void setDateStart(final Long dateStart) {
+		this.dateStart = dateStart;
+	}
+
+	public Long getDateEnd() {
+		return dateEnd;
+	}
+
+	public void setDateEnd(final Long dateEnd) {
+		this.dateEnd = dateEnd;
+	}
+
+	public ScrapingStatus getStatus() {
+		return status;
+	}
+
+	public void setStatus(final ScrapingStatus status) {
+		this.status = status;
+	}
+
+	public String getMessage() {
+		return message;
+	}
+
+	public void setMessage(final String message) {
+		this.message = message;
+	}
+
+	public void startNew(final String message) {
+		setId("scraping-" + UUID.randomUUID());
+		setDateStart(System.currentTimeMillis());
+		setDateEnd(null);
+		setStatus(ScrapingStatus.RUNNING);
+		setMessage(message);
+		log.info(message);
+	}
+
+	public void complete() {
+		setDateEnd(System.currentTimeMillis());
+		setStatus(ScrapingStatus.SUCCESS);
+
+		final long millis = getDateEnd() - getDateStart();
+		setMessage(String
+			.format("Scraping completed in %d min, %d sec", TimeUnit.MILLISECONDS.toMinutes(millis), TimeUnit.MILLISECONDS.toSeconds(millis) -
+				TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))));
+
+		log.info(getMessage());
+
+	}
+
+	public void fail(final Throwable e) {
+		setDateEnd(new Date().getTime());
+		setStatus(ScrapingStatus.FAILED);
+		setMessage(e.getMessage());
+		log.error("Error scraping", e);
+	}
+
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecutor.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecutor.java
new file mode 100644
index 00000000..e3ccb6ff
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingExecutor.java
@@ -0,0 +1,40 @@
+package eu.dnetlib.bioschemas.api.scraper;
+
+import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
+import org.springframework.stereotype.Component;
+
+import javax.servlet.http.HttpServletRequest;
+
+@Component
+public class ScrapingExecutor {
+
+	private final ScrapingExecution lastScrapingExecution = new ScrapingExecution();
+
+	public ScrapingExecution getLastScrapingExecution() {
+		return lastScrapingExecution;
+	}
+
+	public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr) {
+		synchronized (lastScrapingExecution) {
+			if (lastScrapingExecution.getStatus() != ScrapingStatus.RUNNING) {
+				lastScrapingExecution.startNew("Scraping for " + datasourceKey + " " + sitemapUrl + " - request from " + remoteAddr);
+				new Thread(() -> {
+					try {
+						String sitemapUrlKey = "loc";
+						String outputFilename = datasourceKey.concat(outputDataPattern);
+						ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
+						service.start();
+						lastScrapingExecution.complete();
+					} catch (final Throwable e) {
+						lastScrapingExecution.fail(e);
+					}
+				}).start();
+			} else {
+				final long now = System.currentTimeMillis();
+				return new ScrapingExecution(null, now, now, ScrapingStatus.NOT_LAUNCHED, "Another scraping is running");
+			}
+
+		}
+		return lastScrapingExecution;
+	}
+}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingStatus.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingStatus.java
new file mode 100644
index 00000000..3d58ee2c
--- /dev/null
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapingStatus.java
@@ -0,0 +1,9 @@
+package eu.dnetlib.bioschemas.api.scraper;
+
+public enum ScrapingStatus {
+	SUCCESS,
+	FAILED,
+	RUNNING,
+	NOT_LAUNCHED,
+	NOT_YET_STARTED
+}
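
Usage note: the new endpoints can be exercised with two plain GET requests. The standalone client below is a minimal sketch, not part of this change; the base URL and class name are assumptions (adjust them to your deployment), while the endpoint paths, parameters, and the mobidb sitemap come from BioschemasAPIController and request_example.txt above.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class ScrapingClientExample {

	// Assumed deployment address; adjust to where bioschemas-api actually runs.
	private static final String BASE = "http://localhost:8080";

	private static String get(final String path) throws Exception {
		final HttpURLConnection conn = (HttpURLConnection) new URL(BASE + path).openConnection();
		conn.setRequestMethod("GET");
		try (BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
			final StringBuilder body = new StringBuilder();
			String line;
			while ((line = in.readLine()) != null) {
				body.append(line);
			}
			return body.toString();
		}
	}

	public static void main(final String[] args) throws Exception {
		// Launch a run: the response is the ScrapingExecution serialized as JSON
		// (id, dateStart, dateEnd, status, message). The sitemapUrl must be URL-encoded.
		System.out.println(get("/startScraping?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz"));

		// Poll the status of the last (or currently running) execution.
		System.out.println(get("/startScraping/status"));
	}
}

While an execution is RUNNING, a concurrent call to /startScraping does not spawn a second thread: the synchronized block in ScrapingExecutor returns a ScrapingExecution with status NOT_LAUNCHED instead, and /startScraping/status always reports the last known execution.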