package eu.dnetlib.bmuse_webapp.scraper; import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord; import eu.dnetlib.bmuse_webapp.utils.CompressorUtil; import hwu.elixir.scrape.exceptions.CannotWriteException; import hwu.elixir.scrape.exceptions.FourZeroFourException; import hwu.elixir.scrape.exceptions.JsonLDInspectionException; import hwu.elixir.scrape.exceptions.MissingMarkupException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Date; /** * * @see BMUSEScraper * @see ScrapeState * */ public class ScrapeThread extends Thread { private ScrapeState scrapeState; private BMUSEScraper process; private int waitTime; private boolean fileWritten = true; private int scrapeVersion = 1; private static Logger logger = LoggerFactory.getLogger(ScrapeThread.class); /** * Sets up a thread for actually scrapping. * * @param scraper Scraper that will actually do the scraping. * @param scrapeState Object that maintains state across threads. * @param waitTime How long (in seconds) thread should wait after scraping * page before attempting new page. * @param contextVersion The context URL used is 'https://bioschemas.org/crawl/CONTEXTVERSION/ID' Where ID is the id of the CrawlRecord pulled. * */ public ScrapeThread(BMUSEScraper scraper, ScrapeState scrapeState, int waitTime, int contextVersion) { this.scrapeState = scrapeState; process = scraper; this.waitTime = waitTime; this.scrapeVersion = contextVersion; } @Override /** * Defines high-level process of scraping. Actual scraping done by an * implementation of Scraper. If page scrape successful will add url to * Scrape.sitesScraped * * @see Scraper * @see SimpleScraper */ public void run() { while (scrapeState.pagesLeftToScrape()) { CrawlRecord record = scrapeState.getURLToProcess(); if (record == null) break; record.setContext("https://bioschemas.org/crawl/" + scrapeVersion +"/" + record.getId()); record.setDateScraped(new Date()); try { String nquads = process.getNQUADSFromUrl(record.getUrl(), true); logger.info("downloaded "+record.getUrl() + " leftToScrape:" + scrapeState.pagesLeftToScrape()); record.setNquads(CompressorUtil.compressValue(nquads)); if (!nquads.isEmpty()) { scrapeState.addSuccessfulScrapedURL(record); } else { scrapeState.addFailedToScrapeURL(record); } } catch(FourZeroFourException fourZeroFourException) { scrapeState.setStatusTo404(record); fileWritten = false; } catch (JsonLDInspectionException je) { scrapeState.setStatusToHumanInspection(record); fileWritten = false; } catch (CannotWriteException cannotWrite) { logger.error("Caught cannot read file, setting worked to false!"); fileWritten = false; scrapeState.addFailedToScrapeURL(record); return; // no point in continuing } catch (MissingMarkupException e) { logger.error("Cannot obtain markup from " + record.getUrl() +"."); fileWritten = false; scrapeState.addFailedToScrapeURL(record); } catch (Exception e) { e.printStackTrace(); } try { ScrapeThread.sleep(100 * waitTime); } catch (InterruptedException e) { e.printStackTrace(); } } process.shutdown(); } public boolean isFileWritten() { return fileWritten; } }