package eu.dnetlib.bioschemas.api;

import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import eu.dnetlib.bioschemas.api.scraper.BMUSEScraper;
import eu.dnetlib.bioschemas.api.scraper.ScrapeState;
import eu.dnetlib.bioschemas.api.scraper.ScrapeThread;
import eu.dnetlib.bioschemas.api.utils.UrlParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Runs the scrape. Collects a list of URLs (in the form of {@link CrawlRecord}s) to scrape.
 */
public class ServiceScrapeDriver {

    private static final String propertiesFile = "application.properties";

    private int waitTime = 1;
    private int numberOfPagesToCrawlInALoop;
    private int totalNumberOfPagesToCrawlInASession;
    private String outputFolder;
    private int pagesCounter = 0;
    private int scrapeVersion = 1;

    private String sitemapUrl;
    private String sitemapURLKey;
    private String maxScrapedPages;
    private String outputFilename;

    private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");

    private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);
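
    // sitemapUrl, sitemapURLKey, maxScrapedPages and outputFilename are supplied through the
    // constructor below; waitTime, outputFolder, numberOfPagesToCrawlInALoop,
    // totalNumberOfPagesToCrawlInASession and scrapeVersion are loaded from
    // application.properties in processProperties().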

    public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
        this.sitemapUrl = sitemapUrl;
        this.sitemapURLKey = sitemapURLKey;
        this.maxScrapedPages = maxScrapedPages;
        this.outputFilename = outputFilename;
    }
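
    /*
     * Minimal usage sketch (illustrative only; the sitemap URL, "loc" key, page cap and
     * output filename below are hypothetical example values, not taken from this codebase):
     *
     *   ServiceScrapeDriver driver = new ServiceScrapeDriver(
     *           "https://example.org/sitemap.xml", // sitemap listing the pages to scrape
     *           "loc",                             // sitemap element whose text holds each page URL
     *           "100",                             // maximum pages to scrape; may be null for no limit
     *           "output.nq");                      // file created under outputFolder
     *   driver.start();                            // runs a single scrape session
     */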

    /**
     * Runs the scrape process.
     */
    public void start() throws IOException {
        runScrape();
    }

    /**
     * Fires off threads.
     * Originally designed as a multi-threaded process; now reduced to a single thread as
     * the Selenium webdriver is too expensive to run multi-threaded. However, the threading
     * has been left in situ in case it is useful in the future.
     */
    private void runScrape() throws IOException {
        processProperties();
        Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());

        // Cap the number of sitemap entries if maxScrapedPages is configured
        Stream<Element> urlStream;
        if (Objects.nonNull(maxScrapedPages)) {
            urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
        } else {
            urlStream = urls.stream();
        }
        List<Element> sites = urlStream.collect(Collectors.toList());
        logger.info("Pages available for scraping: " + sites.size());

        List<CrawlRecord> pagesToPull = generatePagesToPull(sites);
        if (pagesToPull.isEmpty()) {
            logger.error("Cannot retrieve URLs");
            throw new RuntimeException("No pages found from sitemap");
        }

        ScrapeState scrapeState = new ScrapeState(pagesToPull);

        logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
        while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
            logger.debug(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);

            // One scrape thread per iteration; kept single-threaded (see method javadoc above)
            ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
            scrape1.setName("S1");
            scrape1.start();
            long startTime = System.nanoTime();

            try {
                scrape1.join();
            } catch (InterruptedException e) {
                logger.error("Exception waiting on thread", e);
                return;
            }

            if (!scrape1.isFileWritten()) {
                logger.error("Could not write output file so shutting down!");
                Date date = new Date(System.currentTimeMillis());
                logger.info("ENDING CRAWL after failure at: " + formatter.format(date));
                return;
            }

            logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
            long endTime = System.nanoTime();
            long timeElapsed = endTime - startTime;
            logger.debug("Time in s to complete: " + timeElapsed / 1e+9);
            pagesCounter += numberOfPagesToCrawlInALoop;
            logger.debug("ENDED loop");
        }
logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
|
|
|
|

        // Recreate the output file so that each session starts from an empty file
        File output = new File(outputFolder.concat("/").concat(outputFilename));
        if (output.exists()) {
            output.delete();
            output.createNewFile();
        }
        FileWriter fileWriter = new FileWriter(output.getAbsoluteFile(), true); // true to append
        BufferedWriter bufferedWriter = new BufferedWriter(fileWriter);

        // Append each processed page's N-Quads to the output file
        List<CrawlRecord> processed = scrapeState.getPagesProcessed();
        for (CrawlRecord record : processed) {
            try {
                bufferedWriter.write(record.getNquads());
                bufferedWriter.newLine();
                bufferedWriter.flush();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        bufferedWriter.close();
        logger.info("Data stored into " + output.getAbsolutePath());
    }

    /**
     * Gets a list of URLs (in the form of CrawlRecords) that need to be scraped.
     *
     * @param sites sitemap elements whose text holds the page URLs
     * @return List of URLs to be scraped
     * @see CrawlRecord
     */
    private List<CrawlRecord> generatePagesToPull(List<Element> sites) {
        List<CrawlRecord> crawls = sites
                .stream()
                .map(s -> {
                    CrawlRecord crawlRecord = new CrawlRecord(s.text());
                    // Use the last path segment of the URL as the record name
                    String[] urlSplitted = crawlRecord.getUrl().split("/");
                    String name = urlSplitted[urlSplitted.length - 1];
                    crawlRecord.setName(name);
                    return crawlRecord;
                })
                .collect(Collectors.toList());
        return crawls;
    }
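
    /*
     * For reference, a typical sitemap entry looks roughly like the (hypothetical) snippet
     * below; generatePagesToPull takes the text of each element matched by sitemapURLKey
     * (e.g. "loc") and keeps the last path segment as the record name:
     *
     *   <url>
     *     <loc>https://example.org/records/P12345</loc>
     *   </url>
     *
     * giving a CrawlRecord with url "https://example.org/records/P12345" and name "P12345".
     */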

    /**
     * Updates properties based on the properties file in src/main/resources.
     */
    private void processProperties() {
        ClassLoader classLoader = ServiceScrapeDriver.class.getClassLoader();

        InputStream is = classLoader.getResourceAsStream(propertiesFile);
        if (is == null) {
            logger.error("Cannot find " + propertiesFile + " file");
            throw new IllegalArgumentException(propertiesFile + " file is not found!");
        }

        Properties prop = new Properties();

        try {
            prop.load(is);
        } catch (IOException e) {
            logger.error("Cannot load application.properties", e);
            System.exit(0);
        }

        waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
        logger.info("waitTime: " + waitTime);
        outputFolder = prop.getProperty("outputFolder").trim();
        logger.info("outputFolder: " + outputFolder);
        numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
        logger.info("numberOfPagesToCrawlInALoop: " + numberOfPagesToCrawlInALoop);
        totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
        logger.info("totalNumberOfPagesToCrawlInASession: " + totalNumberOfPagesToCrawlInASession);
        scrapeVersion = Integer.parseInt(prop.getProperty("scrapeVersion").trim());
        logger.info("scrapeVersion: " + scrapeVersion);
        logger.info("\n\n\n");
    }
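
    /*
     * Illustrative application.properties sketch (the values are hypothetical; only the
     * keys are the ones read by processProperties above):
     *
     *   waitTime=1
     *   outputFolder=/tmp/bioschemas
     *   numberOfPagesToCrawlInALoop=8
     *   totalNumberOfPagesToCrawlInASession=32
     *   scrapeVersion=1
     */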

    public String getSitemapUrl() {
        return sitemapUrl;
    }

    public String getSitemapURLKey() {
        return sitemapURLKey;
    }

    // Extracts the last path segment of a page URL
    private String getId(String pageUrl) {
        String[] parts = pageUrl.split("/");
        return parts[parts.length - 1];
    }
}