dnet-applications/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java

72 lines
2.1 KiB
Java

package eu.dnetlib.bmuse_webapp.scraper;
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Provides the
* actual scraping functionality.
*
* Scrapes a given URL, converts into NQuads and writes to a file (name derived
* from URL). If the file already exists it will be overwritten.
*
*
* @see ScraperFilteredCore
*
*/
public class ServiceScraper extends ScraperFilteredCore {
private static Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
private StatusOfScrape status= null;
/**
* Orchestrates the process of scraping a site before converting the extracted
* triples to NQuads and writing to a file.
*
* @param url Site to be scraped
* @param contextCounter Number used to generate the named graph/context and
* the URLs used to replace blank nodes.
* @param outputFolderName Location to which the NQuads will be written
* @return True if success; false otherwise
* @throws FourZeroFourException
* @throws JsonLDInspectionException
* @throws CannotWriteException
* @throws MissingMarkupException
*
*/
public boolean scrape(String url, Long contextCounter, String outputFolderName, String fileName, StatusOfScrape status) throws FourZeroFourException, JsonLDInspectionException, CannotWriteException, MissingMarkupException {
this.status = status;
logger.info("scraping "+url + " to "+fileName);
return scrape(url, outputFolderName, fileName, contextCounter, true);
}
@Override
/* Now takes account of StateOfCrawl
*/
protected String wrapHTMLExtraction(String url) throws FourZeroFourException {
String html = "";
if (status.equals(StatusOfScrape.UNTRIED) || status.equals(StatusOfScrape.FAILED)) {
try {
html = getHtmlViaSelenium(url);
} catch (SeleniumException e) {
// try again
try {
html = getHtmlViaSelenium(url);
} catch (SeleniumException e2) {
return "";
}
}
} else {
return "";
}
return html;
}
}