package eu.dnetlib.bmuse_webapp.scraper;

import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Provides the actual scraping functionality.
 *
 * Scrapes a given URL, converts the extracted triples into NQuads and writes
 * them to a file (name derived from the URL). If the file already exists it
 * will be overwritten.
 *
 * @see ScraperFilteredCore
 */
public class ServiceScraper extends ScraperFilteredCore {

    private static final Logger logger = LoggerFactory.getLogger(ServiceScraper.class);

    private StatusOfScrape status = null;

    /**
     * Orchestrates the process of scraping a site before converting the extracted
     * triples to NQuads and writing them to a file.
     *
     * @param url              Site to be scraped
     * @param contextCounter   Number used to generate the named graph/context and
     *                         the URLs used to replace blank nodes.
     * @param outputFolderName Location to which the NQuads will be written
     * @param fileName         Name of the output file
     * @param status           Current status of the scrape; determines whether the
     *                         HTML is fetched at all.
     * @return True if successful; false otherwise
     * @throws FourZeroFourException
     * @throws JsonLDInspectionException
     * @throws CannotWriteException
     * @throws MissingMarkupException
     */
    public boolean scrape(String url, Long contextCounter, String outputFolderName, String fileName,
            StatusOfScrape status)
            throws FourZeroFourException, JsonLDInspectionException, CannotWriteException, MissingMarkupException {
        this.status = status;
        logger.info("scraping " + url + " to " + fileName);
        return scrape(url, outputFolderName, fileName, contextCounter, true);
    }

    /* Now takes account of StatusOfScrape: HTML is only fetched for UNTRIED or FAILED scrapes. */
    @Override
    protected String wrapHTMLExtraction(String url) throws FourZeroFourException {
        String html = "";
        if (status.equals(StatusOfScrape.UNTRIED) || status.equals(StatusOfScrape.FAILED)) {
            try {
                html = getHtmlViaSelenium(url);
            } catch (SeleniumException e) {
                // Selenium fetches can fail transiently, so retry once before giving up.
                try {
                    html = getHtmlViaSelenium(url);
                } catch (SeleniumException e2) {
                    return "";
                }
            }
        } else {
            return "";
        }
        return html;
    }
}
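
// A minimal usage sketch, kept package-private and illustrative only: the URL,
// context counter, output folder and file name below are placeholder values,
// not part of this class. It exercises the scrape(...) entry point above with
// StatusOfScrape.UNTRIED, one of the two states under which wrapHTMLExtraction
// will actually fetch HTML.
class ServiceScraperUsageExample {

    public static void main(String[] args) {
        ServiceScraper scraper = new ServiceScraper();
        try {
            // Placeholder arguments; in practice these come from the crawl configuration.
            boolean ok = scraper.scrape("https://example.org/record/1", 0L,
                    "/tmp/nquads", "record-1", StatusOfScrape.UNTRIED);
            System.out.println("scrape succeeded: " + ok);
        } catch (Exception e) {
            // The checked exceptions declared by scrape(...) are simply reported here.
            System.err.println("scrape failed: " + e.getMessage());
        }
    }
}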