package eu.dnetlib.bioschemas.api.scraper;

import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import eu.dnetlib.bioschemas.api.utils.CompressorUtil;
import hwu.elixir.scrape.exceptions.CannotWriteException;
import hwu.elixir.scrape.exceptions.FourZeroFourException;
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Date;

/**
 * A thread that repeatedly pulls {@link CrawlRecord}s from a shared {@link ScrapeState}
 * and scrapes each record's URL with a {@link BMUSEScraper}.
 *
 * @see BMUSEScraper
 * @see ScrapeState
 */
public class ScrapeThread extends Thread {

    private ScrapeState scrapeState;    // shared state: work queue and result lists
    private BMUSEScraper process;       // scraper doing the actual page processing
    private int waitTime;               // pause between pages, in seconds
    private boolean fileWritten = true; // false once any scrape fails to be processed
    private int scrapeVersion = 1;      // version component of the crawl context URL

    private static Logger logger = LoggerFactory.getLogger(ScrapeThread.class);

    /**
     * Sets up a thread for the actual scraping.
     *
     * @param scraper        Scraper that will actually do the scraping.
     * @param scrapeState    Object that maintains state across threads.
     * @param waitTime       How long (in seconds) the thread should wait after scraping a
     *                       page before attempting the next one.
     * @param contextVersion The context URL used is 'https://bioschemas.org/crawl/CONTEXTVERSION/ID',
     *                       where ID is the id of the CrawlRecord pulled.
     */
    public ScrapeThread(BMUSEScraper scraper, ScrapeState scrapeState, int waitTime, int contextVersion) {
        this.scrapeState = scrapeState;
        process = scraper;
        this.waitTime = waitTime;
        this.scrapeVersion = contextVersion;
    }
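
    // Minimal usage sketch (illustrative only; assumes a configured BMUSEScraper and a
    // ScrapeState already populated with CrawlRecords -- the ScrapeState constructor used
    // here is an assumption, not a confirmed signature):
    //
    //   ScrapeState state = new ScrapeState(recordsToScrape);
    //   ScrapeThread thread = new ScrapeThread(new BMUSEScraper(), state, 1, 1);
    //   thread.start();
    //   thread.join();
    //   boolean allPersisted = thread.isFileWritten();
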
    /**
     * Defines the high-level process of scraping. The actual scraping is done by the
     * supplied {@link BMUSEScraper}. If a page scrape is successful, the record is
     * added to the {@link ScrapeState} list of successfully scraped sites.
     *
     * @see BMUSEScraper
     * @see ScrapeState
     */
    @Override
    public void run() {
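        // Keep claiming records from the shared ScrapeState until none remain;
        // the shared state is expected to hand each record to exactly one thread.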
        while (scrapeState.pagesLeftToScrape()) {
            CrawlRecord record = scrapeState.getURLToProcess();

            if (record == null)
                break;

            record.setContext("https://bioschemas.org/crawl/" + scrapeVersion + "/" + record.getId());
            record.setDateScraped(new Date());
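
            // Scrape the page to N-Quads; an empty result means no markup was extracted,
            // which is recorded as a failed scrape.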
            try {
                String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
                logger.info("downloaded " + record.getUrl() + " leftToScrape:" + scrapeState.getNumberPagesLeftToScrape());
                record.setNquads(CompressorUtil.compressValue(nquads));
                if (!nquads.isEmpty()) {
                    scrapeState.addSuccessfulScrapedURL(record);
                } else {
                    scrapeState.addFailedToScrapeURL(record);
                }
            } catch (FourZeroFourException fourZeroFourException) {
                // Page not found: record the 404 status rather than a generic failure.
                scrapeState.setStatusTo404(record);
                fileWritten = false;
            } catch (JsonLDInspectionException je) {
                // JSON-LD could not be inspected automatically; flag for human inspection.
                scrapeState.setStatusToHumanInspection(record);
                fileWritten = false;
            } catch (CannotWriteException cannotWrite) {
                logger.error("Cannot write file; setting fileWritten to false!");
                fileWritten = false;
                scrapeState.addFailedToScrapeURL(record);
                return; // no point in continuing if the output cannot be written
            } catch (MissingMarkupException e) {
                logger.error("Cannot obtain markup from " + record.getUrl() + ".");
                fileWritten = false;
                scrapeState.addFailedToScrapeURL(record);
            } catch (Exception e) {
                logger.error("Unexpected error while scraping " + record.getUrl(), e);
            }
            try {
                // Pause between pages; waitTime is documented in seconds, so convert to ms.
                Thread.sleep(1000L * waitTime);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers can observe the interruption.
                Thread.currentThread().interrupt();
            }
        }
        process.shutdown();
    }

    /**
     * @return false if any scrape in this thread failed to be written or processed,
     *         true otherwise.
     */
    public boolean isFileWritten() {
        return fileWritten;
    }
}