72 lines
2.1 KiB
Java
72 lines
2.1 KiB
Java
package eu.dnetlib.bioschemas.api.scraper;
|
|
|
|
import eu.dnetlib.bioschemas.api.crawl.StatusOfScrape;
|
|
import hwu.elixir.scrape.exceptions.*;
|
|
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
/**
|
|
* Provides the
|
|
* actual scraping functionality.
|
|
*
|
|
* Scrapes a given URL, converts into NQuads and writes to a file (name derived
|
|
* from URL). If the file already exists it will be overwritten.
|
|
*
|
|
*
|
|
* @see ScraperFilteredCore
|
|
*
|
|
*/
|
|
public class ServiceScraper extends ScraperFilteredCore {
|
|
|
|
private static Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
|
|
|
|
private StatusOfScrape status= null;
|
|
|
|
|
|
/**
|
|
* Orchestrates the process of scraping a site before converting the extracted
|
|
* triples to NQuads and writing to a file.
|
|
*
|
|
* @param url Site to be scraped
|
|
* @param contextCounter Number used to generate the named graph/context and
|
|
* the URLs used to replace blank nodes.
|
|
* @param outputFolderName Location to which the NQuads will be written
|
|
* @return True if success; false otherwise
|
|
* @throws FourZeroFourException
|
|
* @throws JsonLDInspectionException
|
|
* @throws CannotWriteException
|
|
* @throws MissingMarkupException
|
|
*
|
|
*/
|
|
public boolean scrape(String url, Long contextCounter, String outputFolderName, String fileName, StatusOfScrape status) throws FourZeroFourException, JsonLDInspectionException, CannotWriteException, MissingMarkupException {
|
|
this.status = status;
|
|
logger.info("scraping "+url + " to "+fileName);
|
|
return scrape(url, outputFolderName, fileName, contextCounter, true);
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
/* Now takes account of StateOfCrawl
|
|
*/
|
|
protected String wrapHTMLExtraction(String url) throws FourZeroFourException {
|
|
String html = "";
|
|
if (status.equals(StatusOfScrape.UNTRIED) || status.equals(StatusOfScrape.FAILED)) {
|
|
try {
|
|
html = getHtmlViaSelenium(url);
|
|
} catch (SeleniumException e) {
|
|
// try again
|
|
try {
|
|
html = getHtmlViaSelenium(url);
|
|
} catch (SeleniumException e2) {
|
|
return "";
|
|
}
|
|
}
|
|
} else {
|
|
return "";
|
|
}
|
|
return html;
|
|
}
|
|
}
|