dnet-applications/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/ScrapeThread.java

package eu.dnetlib.bioschemas.api.scraper;

import java.util.Date;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import eu.dnetlib.bioschemas.api.utils.CompressorUtil;
import hwu.elixir.scrape.exceptions.CannotWriteException;
import hwu.elixir.scrape.exceptions.FourZeroFourException;
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
/**
 * A worker thread that pulls {@link CrawlRecord}s from a shared {@link ScrapeState}
 * and scrapes each of them with a {@link BMUSEScraper}.
 *
 * @see BMUSEScraper
 * @see ScrapeState
 */
public class ScrapeThread extends Thread {

    private ScrapeState scrapeState;
    private BMUSEScraper process;
    private int waitTime;
    private boolean fileWritten = true;
    private int scrapeVersion = 1;

    private static final Logger logger = LoggerFactory.getLogger(ScrapeThread.class);
    /**
     * Sets up a thread for the actual scraping.
     *
     * @param scraper        Scraper that will actually do the scraping.
     * @param scrapeState    Object that maintains state across threads.
     * @param waitTime       How long (in seconds) the thread should wait after
     *                       scraping a page before attempting the next one.
     * @param contextVersion The context URL used is
     *                       'https://bioschemas.org/crawl/CONTEXTVERSION/ID', where
     *                       ID is the id of the {@link CrawlRecord} pulled.
     */
    public ScrapeThread(BMUSEScraper scraper, ScrapeState scrapeState, int waitTime, int contextVersion) {
        this.scrapeState = scrapeState;
        this.process = scraper;
        this.waitTime = waitTime;
        this.scrapeVersion = contextVersion;
    }
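
    // For illustration only (both values below are hypothetical): with
    // contextVersion = 1 and a CrawlRecord whose id is 42, run() sets the
    // record's context to "https://bioschemas.org/crawl/1/42".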
    /**
     * Defines the high-level process of scraping. The actual scraping is done by
     * an implementation of Scraper. If a page is scraped successfully, its URL is
     * added to Scrape.sitesScraped.
     *
     * @see Scraper
     * @see SimpleScraper
     */
    @Override
    public void run() {
        while (scrapeState.pagesLeftToScrape()) {
            CrawlRecord record = scrapeState.getURLToProcess();
            if (record == null)
                break;

            record.setContext("https://bioschemas.org/crawl/" + scrapeVersion + "/" + record.getId());
            record.setDateScraped(new Date());

            try {
                String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
                logger.info("downloaded {} leftToScrape: {}", record.getUrl(), scrapeState.getNumberPagesLeftToScrape());
                record.setNquads(CompressorUtil.compressValue(nquads));
                if (!nquads.isEmpty()) {
                    scrapeState.addSuccessfulScrapedURL(record);
                } else {
                    scrapeState.addFailedToScrapeURL(record);
                }
            } catch (FourZeroFourException fourZeroFourException) {
                scrapeState.setStatusTo404(record);
                fileWritten = false;
            } catch (JsonLDInspectionException je) {
                scrapeState.setStatusToHumanInspection(record);
                fileWritten = false;
            } catch (CannotWriteException cannotWrite) {
                logger.error("Caught CannotWriteException, setting fileWritten to false!");
                fileWritten = false;
                scrapeState.addFailedToScrapeURL(record);
                break; // no point in continuing, but still shut the scraper down below
            } catch (MissingMarkupException e) {
                logger.error("Cannot obtain markup from {}.", record.getUrl());
                fileWritten = false;
                scrapeState.addFailedToScrapeURL(record);
            } catch (Exception e) {
                logger.error("Unexpected error while scraping " + record.getUrl(), e);
            }
            try {
                // waitTime is in seconds, Thread.sleep() expects milliseconds
                ScrapeThread.sleep(1000L * waitTime);
            } catch (InterruptedException e) {
                logger.error("Interrupted while waiting between pages", e);
                Thread.currentThread().interrupt();
                break;
            }
        }
        process.shutdown();
    }
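
    /**
     * @return false if any record hit a 404, required human inspection, could not
     *         be written, or yielded no markup; true otherwise.
     */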
    public boolean isFileWritten() {
        return fileWritten;
    }
}
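
/*
 * A minimal usage sketch, not part of the class. It assumes a ScrapeState
 * constructor taking the list of CrawlRecords to process and a no-arg
 * BMUSEScraper constructor; both signatures are assumptions inferred from how
 * this class uses them, not confirmed against the rest of the codebase.
 *
 *     List<CrawlRecord> records = ...;  // records pulled from the crawl queue
 *     ScrapeState state = new ScrapeState(records);
 *
 *     ScrapeThread thread = new ScrapeThread(new BMUSEScraper(), state, 5, 1);
 *     thread.start();
 *     thread.join(); // wait until no pages are left to scrape
 *
 *     if (!thread.isFileWritten()) {
 *         // at least one record failed (404, inspection needed, write/markup error)
 *     }
 */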