dnet-applications/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/ServiceScrapeDriver.java

package eu.dnetlib.bioschemas.api;

import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import eu.dnetlib.bioschemas.api.scraper.BMUSEScraper;
import eu.dnetlib.bioschemas.api.scraper.ScrapeState;
import eu.dnetlib.bioschemas.api.scraper.ScrapeThread;
import eu.dnetlib.bioschemas.api.utils.UrlParser;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Runs the scrape. Collects a list of URLs (in the form of {@link CrawlRecord}s) to scrape.
 *
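 * <p>A minimal usage sketch (the sitemap URL, URL key, page limit and output filename below are
 * illustrative placeholders, not values taken from this project's configuration):</p>
 * <pre>{@code
 * ServiceScrapeDriver driver = new ServiceScrapeDriver(
 *     "https://example.org/sitemap.xml", // sitemapUrl (assumed example)
 *     "loc",                             // sitemapURLKey (assumed example)
 *     "100",                             // maxScrapedPages, parsed as a numeric limit
 *     "scraped-output.nq");              // outputFilename, created inside the configured outputFolder
 * driver.start();
 * }</pre>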
 */
public class ServiceScrapeDriver {

    private static final String propertiesFile = "application.properties";

    private int waitTime = 1;
    private int numberOfPagesToCrawlInALoop;
    private int totalNumberOfPagesToCrawlInASession;
    private String outputFolder;
    private int pagesCounter = 0;
    private int scrapeVersion = 1;

    private String sitemapUrl;
    private String sitemapURLKey;
    private String maxScrapedPages;
    private String outputFilename;

    private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
    private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);

    public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
        this.sitemapUrl = sitemapUrl;
        this.sitemapURLKey = sitemapURLKey;
        this.maxScrapedPages = maxScrapedPages;
        this.outputFilename = outputFilename;
    }

    /**
     * Runs the scrape process.
     */
    public void start() throws IOException {
        runScrape();
    }

    /**
     * Fires off the scrape threads.
     * Originally designed as a multi-threaded process; now reduced to a single thread because
     * the Selenium webdriver is too expensive to run multi-threaded. However, the threading
     * has been left in situ in case it is useful in the future.
     */
    private void runScrape() throws IOException {
        processProperties();
        String url = sitemapUrl.toLowerCase(); // note: currently computed but not used below
        Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());
        Stream<Element> urlStream = null;
        if (Objects.nonNull(maxScrapedPages)) {
            urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
        } else {
            urlStream = urls.stream();
        }
        List<Element> sites = urlStream.collect(Collectors.toList());
        logger.info("Pages available for scraping: " + sites.size());
        List<CrawlRecord> pagesToPull = generatePagesToPull(sites);
        if (pagesToPull.isEmpty()) {
            logger.error("Cannot retrieve URLs");
            throw new RuntimeException("No pages found from sitemap");
        }
        ScrapeState scrapeState = new ScrapeState(pagesToPull);
        logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
        while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
            logger.debug(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
            ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
            scrape1.setName("S1");
            scrape1.start();
            long startTime = System.nanoTime();
            try {
                scrape1.join();
            } catch (InterruptedException e) {
                logger.error("Exception waiting on thread", e);
                Thread.currentThread().interrupt(); // restore the interrupt flag before bailing out
                return;
            }
            if (!scrape1.isFileWritten()) {
                logger.error("Could not write output file so shutting down!");
                Date date = new Date(System.currentTimeMillis());
                logger.info("ENDING CRAWL after failure at: " + formatter.format(date));
                return;
            }
            logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
            long endTime = System.nanoTime();
            long timeElapsed = endTime - startTime;
            logger.debug("Time in s to complete: " + timeElapsed / 1e+9);
            pagesCounter += numberOfPagesToCrawlInALoop;
            logger.debug("ENDED loop");
        }
        logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
        File output = new File(outputFolder.concat("/").concat(outputFilename));
        if (output.exists()) {
            output.delete();
            output.createNewFile();
        }
        FileWriter fileWriter = new FileWriter(output.getAbsoluteFile(), true); // true to append
        BufferedWriter bufferedWriter = new BufferedWriter(fileWriter);
        List<CrawlRecord> processed = scrapeState.getPagesProcessed();
        for (int i = 0; i < processed.size(); i++) {
            try {
                bufferedWriter.write(processed.get(i).getNquads());
                bufferedWriter.newLine();
                bufferedWriter.flush();
            } catch (IOException e) {
                logger.error("Could not write record " + processed.get(i).getUrl(), e);
            }
        }
        bufferedWriter.close();
        logger.info("Data stored into " + output.getAbsolutePath());
    }

    /**
     * Gets the list of URLs (in the form of {@link CrawlRecord}s) that need to be scraped.
     *
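     * <p>The record name is derived from the last path segment of each URL; a hypothetical
     * sitemap entry such as {@code https://example.org/records/ABC123} would therefore produce
     * a {@link CrawlRecord} named {@code ABC123}.</p>
     *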
     * @return list of URLs (as {@link CrawlRecord}s) to be scraped
     * @see CrawlRecord
     */
    private List<CrawlRecord> generatePagesToPull(List<Element> sites) {
        return sites
            .stream()
            .map(s -> {
                CrawlRecord crawlRecord = new CrawlRecord(s.text());
                String[] urlSplitted = crawlRecord.getUrl().split("/");
                String name = urlSplitted[urlSplitted.length - 1];
                crawlRecord.setName(name);
                return crawlRecord;
            })
            .collect(Collectors.toList());
    }

    /**
     * Updates the configuration fields from the {@code application.properties} file on the classpath
     * (kept under {@code src/main/resources}).
     *
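     * <p>A sketch of the keys this method expects; the values shown are illustrative placeholders,
     * not this project's actual configuration:</p>
     * <pre>{@code
     * waitTime=1
     * outputFolder=/tmp/bioschemas
     * numberOfPagesToCrawlInALoop=8
     * totalNumberOfPagesToCrawlInASession=32
     * scrapeVersion=1
     * }</pre>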
     */
    private void processProperties() {
        ClassLoader classLoader = ServiceScrapeDriver.class.getClassLoader();
        InputStream is = classLoader.getResourceAsStream(propertiesFile);
        if (is == null) {
            logger.error("Cannot find " + propertiesFile + " file");
            throw new IllegalArgumentException(propertiesFile + " file is not found!");
        }
        Properties prop = new Properties();
        try {
            prop.load(is);
        } catch (IOException e) {
            logger.error("Cannot load application.properties", e);
            System.exit(0);
        }
        waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
        logger.info("waitTime: " + waitTime);
        outputFolder = prop.getProperty("outputFolder").trim();
        logger.info("outputFolder: " + outputFolder);
        numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
        logger.info("numberOfPagesToCrawlInALoop: " + numberOfPagesToCrawlInALoop);
        totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
        logger.info("totalNumberOfPagesToCrawlInASession: " + totalNumberOfPagesToCrawlInASession);
        scrapeVersion = Integer.parseInt(prop.getProperty("scrapeVersion").trim());
        logger.info("scrapeVersion: " + scrapeVersion);
        logger.info("\n\n\n");
    }

    public String getSitemapUrl() {
        return sitemapUrl;
    }

    public String getSitemapURLKey() {
        return sitemapURLKey;
    }

    private String getId(String pageUrl) {
        String[] parts = pageUrl.split("/");
        return parts[parts.length - 1];
    }
}