package eu.dnetlib.bioschemas.api.scraper;

import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import eu.dnetlib.bioschemas.api.utils.CompressorUtil;
import hwu.elixir.scrape.exceptions.CannotWriteException;
import hwu.elixir.scrape.exceptions.FourZeroFourException;
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Date;

/**
 * A thread that repeatedly pulls {@link CrawlRecord}s from a shared {@link ScrapeState}
 * and scrapes each record's URL with a {@link BMUSEScraper}.
 *
 * @see BMUSEScraper
 * @see ScrapeState
 */
public class ScrapeThread extends Thread {

    private ScrapeState scrapeState;    // shared state: work queue and result lists
    private BMUSEScraper process;       // scraper doing the actual page processing
    private int waitTime;               // pause between pages, in seconds
    private boolean fileWritten = true; // false once any scrape fails to be processed
    private int scrapeVersion = 1;      // version component of the crawl context URL

    private static Logger logger = LoggerFactory.getLogger(ScrapeThread.class);

    /**
     * Sets up a thread for the actual scraping.
     *
     * @param scraper        Scraper that will actually do the scraping.
     * @param scrapeState    Object that maintains state across threads.
     * @param waitTime       How long (in seconds) the thread should wait after scraping a
     *                       page before attempting the next one.
     * @param contextVersion The context URL used is 'https://bioschemas.org/crawl/CONTEXTVERSION/ID',
     *                       where ID is the id of the CrawlRecord pulled.
     */
    public ScrapeThread(BMUSEScraper scraper, ScrapeState scrapeState, int waitTime, int contextVersion) {
        this.scrapeState = scrapeState;
        process = scraper;
        this.waitTime = waitTime;
        this.scrapeVersion = contextVersion;
    }
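
    // Minimal usage sketch (illustrative only; assumes a configured BMUSEScraper and a
    // ScrapeState already populated with CrawlRecords -- the ScrapeState constructor used
    // here is an assumption, not a confirmed signature):
    //
    //   ScrapeState state = new ScrapeState(recordsToScrape);
    //   ScrapeThread thread = new ScrapeThread(new BMUSEScraper(), state, 1, 1);
    //   thread.start();
    //   thread.join();
    //   boolean allPersisted = thread.isFileWritten();
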
    /**
     * Defines the high-level process of scraping. The actual scraping is done by the
     * supplied {@link BMUSEScraper}. If a page scrape is successful, the record is
     * added to the {@link ScrapeState} list of successfully scraped sites.
     *
     * @see BMUSEScraper
     * @see ScrapeState
     */
    @Override
    public void run() {
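        // Keep claiming records from the shared ScrapeState until none remain;
        // the shared state is expected to hand each record to exactly one thread.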
        while (scrapeState.pagesLeftToScrape()) {
            CrawlRecord record = scrapeState.getURLToProcess();

            if (record == null)
                break;

            record.setContext("https://bioschemas.org/crawl/" + scrapeVersion + "/" + record.getId());
            record.setDateScraped(new Date());
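
            // Scrape the page to N-Quads; an empty result means no markup was extracted,
            // which is recorded as a failed scrape.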
            try {
                String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
                logger.info("downloaded " + record.getUrl() + " leftToScrape:" + scrapeState.getNumberPagesLeftToScrape());
                record.setNquads(CompressorUtil.compressValue(nquads));
                if (!nquads.isEmpty()) {
                    scrapeState.addSuccessfulScrapedURL(record);
                } else {
                    scrapeState.addFailedToScrapeURL(record);
                }
            } catch (FourZeroFourException fourZeroFourException) {
                // Page not found: record the 404 status rather than a generic failure.
                scrapeState.setStatusTo404(record);
                fileWritten = false;
            } catch (JsonLDInspectionException je) {
                // JSON-LD could not be inspected automatically; flag for human inspection.
                scrapeState.setStatusToHumanInspection(record);
                fileWritten = false;
            } catch (CannotWriteException cannotWrite) {
                logger.error("Cannot write file; setting fileWritten to false!");
                fileWritten = false;
                scrapeState.addFailedToScrapeURL(record);
                return; // no point in continuing if the output cannot be written
            } catch (MissingMarkupException e) {
                logger.error("Cannot obtain markup from " + record.getUrl() + ".");
                fileWritten = false;
                scrapeState.addFailedToScrapeURL(record);
            } catch (Exception e) {
                logger.error("Unexpected error while scraping " + record.getUrl(), e);
            }
            try {
                // Pause between pages; waitTime is documented in seconds, so convert to ms.
                Thread.sleep(1000L * waitTime);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers can observe the interruption.
                Thread.currentThread().interrupt();
            }
        }
        process.shutdown();
    }

    /**
     * @return false if any scrape in this thread failed to be written or processed,
     *         true otherwise.
     */
    public boolean isFileWritten() {
        return fileWritten;
    }
}