package eu.dnetlib.bioschemas.api;

import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import eu.dnetlib.bioschemas.api.scraper.BMUSEScraper;
import eu.dnetlib.bioschemas.api.scraper.ScrapeState;
import eu.dnetlib.bioschemas.api.scraper.ScrapeThread;
import eu.dnetlib.bioschemas.api.utils.UrlParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Runs the scrape. Collects a list of URLs (in the form of CrawlRecords) to scrape.
 */
public class ServiceScrapeDriver {

    private static final String propertiesFile = "application.properties";

    private int waitTime = 1;
    private int numberOfPagesToCrawlInALoop;
    private int totalNumberOfPagesToCrawlInASession;
    private String outputFolder;
    private int pagesCounter = 0;
    private int scrapeVersion = 1;

    private String sitemapUrl;
    private String sitemapURLKey;
    private String maxScrapedPages;
    private String outputFilename;

    private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
    private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);

    public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
        this.sitemapUrl = sitemapUrl;
        this.sitemapURLKey = sitemapURLKey;
        this.maxScrapedPages = maxScrapedPages;
        this.outputFilename = outputFilename;
    }

    /**
     * Runs the scrape process.
     */
    public void start() throws IOException {
        runScrape();
    }

    /**
     * Fires off threads.
     * Originally designed as a multi-threaded process; now reduced to a single thread as
     * the selenium webdriver is too expensive to run multi-threaded. However, the threading
     * has been left in situ in case it is useful in the future.
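     * Each pass of the crawl loop therefore starts a single {@link ScrapeThread} and joins on it
     * before the next batch of pages is attempted.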
     */
    private void runScrape() throws IOException {
        processProperties();
        String url = sitemapUrl.toLowerCase();
        Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());
        Stream<Element> urlStream = null;
        if (Objects.nonNull(maxScrapedPages)) {
            urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
        } else {
            urlStream = urls.stream();
        }
        List<Element> sites = urlStream.collect(Collectors.toList());
        logger.info("Pages available for scraping: " + sites.size());

        List<CrawlRecord> pagesToPull = generatePagesToPull(sites);
        if (pagesToPull.isEmpty()) {
            logger.error("Cannot retrieve URLs");
            throw new RuntimeException("No pages found from sitemap");
        }

        ScrapeState scrapeState = new ScrapeState(pagesToPull);

        logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
        while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
            logger.debug(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
            ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
            scrape1.setName("S1");
            scrape1.start();
            long startTime = System.nanoTime();

            try {
                scrape1.join();
            } catch (InterruptedException e) {
                logger.error("Exception waiting on thread");
                e.printStackTrace();
                return;
            }

            if (!scrape1.isFileWritten()) {
                logger.error("Could not write output file so shutting down!");
                Date date = new Date(System.currentTimeMillis());
                logger.info("ENDING CRAWL after failure at: " + formatter.format(date));
                return;
            }
            logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());

            long endTime = System.nanoTime();
            long timeElapsed = endTime - startTime;
            logger.debug("Time in s to complete: " + timeElapsed / 1e+9);

            pagesCounter += numberOfPagesToCrawlInALoop;
            logger.debug("ENDED loop");
        }
        logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));

        File output = new File(outputFolder.concat("/").concat(outputFilename));
        if (output.exists()) {
            output.delete();
            output.createNewFile();
        }

        FileWriter fileWriter;
        BufferedWriter bufferedWriter;
        fileWriter = new FileWriter(output.getAbsoluteFile(), true); // true to append
        bufferedWriter = new BufferedWriter(fileWriter);

        List<CrawlRecord> processed = scrapeState.getPagesProcessed();
        for (int i = 0; i < processed.size(); i++) {
            // Write each processed record to the combined output file
            // (the getNquads() accessor is assumed; the original loop body was lost).
            bufferedWriter.write(processed.get(i).getNquads());
            bufferedWriter.newLine();
        }
        bufferedWriter.close();
    }

    /**
     * Converts the sitemap entries into CrawlRecords, naming each record after the last
     * segment of its URL.
     */
    private List<CrawlRecord> generatePagesToPull(List<Element> sites) {
        List<CrawlRecord> crawls = sites
            .stream()
            .map(s -> {
                CrawlRecord crawlRecord = new CrawlRecord(s.text());
                String[] urlSplitted = crawlRecord.getUrl().split("/");
                String name = urlSplitted[urlSplitted.length - 1];
                crawlRecord.setName(name);
                return crawlRecord;
            })
            .collect(Collectors.toList());
        return crawls;
    }

    /**
     * Updates properties based on properties file in src > main > resources
     */
    private void processProperties() {
        ClassLoader classLoader = ServiceScrapeDriver.class.getClassLoader();
        InputStream is = classLoader.getResourceAsStream(propertiesFile);
        if (is == null) {
            logger.error(" Cannot find " + propertiesFile + " file");
            throw new IllegalArgumentException(propertiesFile + " file is not found!");
        }

        Properties prop = new Properties();
        try {
            prop.load(is);
        } catch (IOException e) {
            logger.error(" Cannot load application.properties", e);
            System.exit(0);
        }

        waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
        logger.info(" waitTime: " + waitTime);
        outputFolder = prop.getProperty("outputFolder").trim();
        logger.info(" outputFolder: " + outputFolder);
        numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
        logger.info(" numberOfPagesToCrawl: " + numberOfPagesToCrawlInALoop);
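        // totalNumberOfPagesToCrawlInASession caps the crawl loop in runScrape(); pagesCounter
        // advances by numberOfPagesToCrawlInALoop after each ScrapeThread run until the cap is hit.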
        totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
        logger.info(" totalNumberOfPagesToCrawlInASession: " + totalNumberOfPagesToCrawlInASession);
        scrapeVersion = Integer.parseInt(prop.getProperty("scrapeVersion").trim());
        logger.info(" scrapeVersion: " + scrapeVersion);
        logger.info("\n\n\n");
    }

    public String getSitemapUrl() {
        return sitemapUrl;
    }

    public String getSitemapURLKey() {
        return sitemapURLKey;
    }

    private String getId(String pageUrl) {
        String[] parts = pageUrl.split("/");
        return parts[parts.length - 1];
    }
}
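/*
 * Illustrative usage (a sketch, not part of the original class; the sitemap URL, URL key,
 * page limit and output filename below are placeholder values):
 *
 *   ServiceScrapeDriver driver = new ServiceScrapeDriver(
 *       "https://example.org/sitemap.xml",  // sitemapUrl
 *       "loc",                              // sitemapURLKey: sitemap element holding page URLs
 *       "100",                              // maxScrapedPages (null scrapes every sitemap entry)
 *       "scrape-output.nq");                // outputFilename, created under outputFolder
 *   driver.start();
 */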