Merge pull request 'bioschemas' (#6) from bioschemas into master

Reviewed-on: #6
Michele Artini 2022-06-08 10:19:38 +02:00
commit 3440383904
23 changed files with 1370 additions and 0 deletions

View File

@ -0,0 +1,10 @@
{
"type_source": "SVN",
"goal": "package -U source:jar",
"url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bioschemas-api/trunk/",
"deploy_repository": "dnet5-snapshots",
"version": "5",
"mail": "sandro.labruzzo@isti.cnr.it, michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it, enrico.ottonello@isti.cnr.it",
"deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots",
"name": "dnet-bioschemas-api"
}

View File

@ -0,0 +1,62 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>apps</artifactId>
<version>3.2.8-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>jar</packaging>
<artifactId>bioschemas-api</artifactId>
<dependencies>
<dependency>
<groupId>hwu.elixir</groupId>
<artifactId>bmuse-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-model</artifactId>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${bioschemas-commons-io-version}</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-help-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,8 @@
https://mobidb.org/sitemap2.xml.gz
scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
https://proteinensemble.org/sitemap2.xml.gz
scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
https://disprot.org/sitemap2.xml.gz
scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
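
For reference, each "scrape?..." fragment above targets the startScraping endpoint of the API; assuming a locally running instance with the defaults in application.properties (port 8281, context path /bioschemas), the disprot example expands to:
http://localhost:8281/bioschemas/api/startScraping?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz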

View File

@ -0,0 +1,14 @@
package eu.dnetlib.bioschemas.api;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Profile;
/**
* @author enrico.ottonello
*
*/
@Profile("garr")
@Configuration
public class AppConfigGarr {
}

View File

@ -0,0 +1,42 @@
package eu.dnetlib.bioschemas.api;
import eu.dnetlib.common.app.AbstractDnetApp;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.cache.annotation.EnableCaching;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.scheduling.annotation.EnableScheduling;
import springfox.documentation.builders.ApiInfoBuilder;
import springfox.documentation.builders.RequestHandlerSelectors;
import springfox.documentation.service.ApiInfo;
import springfox.documentation.spring.web.plugins.Docket;
import springfox.documentation.swagger2.annotations.EnableSwagger2;
@SpringBootApplication
@EnableSwagger2
@EnableCaching
@EnableScheduling
@ComponentScan(basePackages = "eu.dnetlib")
public class MainApplication extends AbstractDnetApp {
public static void main(final String[] args) {
SpringApplication.run(MainApplication.class, args);
}
@Override
protected void configSwagger(final Docket docket) {
docket.select()
.apis(RequestHandlerSelectors.any())
.paths(p -> p.contains("/api/"))
.build()
.apiInfo(new ApiInfoBuilder()
.title("D-Net Bioschemas Service APIs")
.description("APIs documentation")
.version("1.1")
.contact(ApiInfo.DEFAULT_CONTACT)
.license("Apache 2.0")
.licenseUrl("http://www.apache.org/licenses/LICENSE-2.0")
.build());
}
}

View File

@ -0,0 +1,215 @@
package eu.dnetlib.bioschemas.api;
import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import eu.dnetlib.bioschemas.api.scraper.BMUSEScraper;
import eu.dnetlib.bioschemas.api.scraper.ScrapeState;
import eu.dnetlib.bioschemas.api.scraper.ScrapeThread;
import eu.dnetlib.bioschemas.api.utils.UrlParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Runs the scrape: collects a list of URLs (in the form of CrawlRecords) and scrapes them.
*
*/
public class ServiceScrapeDriver {
private static final String propertiesFile = "application.properties";
private int waitTime = 1;
private int numberOfPagesToCrawlInALoop;
private int totalNumberOfPagesToCrawlInASession;
private String outputFolder;
private int pagesCounter = 0;
private int scrapeVersion = 1;
private String sitemapUrl;
private String sitemapURLKey;
private String maxScrapedPages;
private String outputFilename;
private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);
public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
this.sitemapUrl = sitemapUrl;
this.sitemapURLKey = sitemapURLKey;
this.maxScrapedPages = maxScrapedPages;
this.outputFilename = outputFilename;
}
/**
* Runs the scrape process
*
*/
public void start() throws IOException {
runScrape();
}
/**
* Fires off threads
* Originally designed as a multi-threaded process; now reduced to a single thread because
* the Selenium webdriver is too expensive to run multi-threaded. However, the threading
* has been left in situ in case it is useful in the future.
*
*/
private void runScrape() throws IOException {
processProperties();
String url = sitemapUrl.toLowerCase();
Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());
Stream<Element> urlStream = null;
if (Objects.nonNull(maxScrapedPages)) {
urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
} else {
urlStream = urls.stream();
}
List<Element> sites = urlStream.collect(Collectors.toList());
logger.info("Pages available for scraping: " + sites.size());
List<CrawlRecord> pagesToPull = generatePagesToPull(sites);
if (pagesToPull.isEmpty()) {
logger.error("Cannot retrieve URLs");
throw new RuntimeException("No pages found from sitemap");
}
ScrapeState scrapeState = new ScrapeState(pagesToPull);
logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
logger.debug(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
scrape1.setName("S1");
scrape1.start();
long startTime = System.nanoTime();
try {
scrape1.join();
} catch (InterruptedException e) {
logger.error("Exception waiting on thread");
e.printStackTrace();
return;
}
if(!scrape1.isFileWritten()) {
logger.error("Could not write output file so shutting down!");
Date date = new Date(System.currentTimeMillis());
logger.info("ENDING CRAWL after failure at: " + formatter.format(date));
return;
}
logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
long endTime = System.nanoTime();
long timeElapsed = endTime - startTime;
logger.debug("Time in s to complete: " + timeElapsed / 1e+9);
pagesCounter += numberOfPagesToCrawlInALoop;
logger.debug("ENDED loop");
}
logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
File output = new File(outputFolder.concat("/").concat(outputFilename));
if (output.exists()) {
output.delete();
output.createNewFile();
}
FileWriter fileWriter;
BufferedWriter bufferedWriter;
fileWriter = new FileWriter(output.getAbsoluteFile(), true); // true to append
bufferedWriter = new BufferedWriter(fileWriter);
List<CrawlRecord> processed = scrapeState.getPagesProcessed();
for (int i=0;i<processed.size();i++) {
try {
bufferedWriter.write(processed.get(i).getNquads());
bufferedWriter.newLine();
bufferedWriter.flush();
} catch (IOException e) {
e.printStackTrace();
}
}
bufferedWriter.close();
logger.info(" Data stored into "+output.getAbsolutePath());
}
/**
* Get a list of URLs (in the form of CrawlRecords) that need to be scraped
*
* @return List of URLs to be scraped
* @see CrawlRecord
*/
private List<CrawlRecord> generatePagesToPull(List<Element> sites) {
List<CrawlRecord> crawls = sites
.stream()
.map(s -> {
CrawlRecord crawlRecord = new CrawlRecord(s.text());
String[] urlSplitted = crawlRecord.getUrl().split("/");
String name = urlSplitted[urlSplitted.length - 1];
crawlRecord.setName(name);
return crawlRecord;
})
.collect(Collectors.toList());
return crawls;
}
/**
* Updates properties based on the properties file in src/main/resources.
*
*/
private void processProperties() {
ClassLoader classLoader = ServiceScrapeDriver.class.getClassLoader();
InputStream is = classLoader.getResourceAsStream(propertiesFile);
if(is == null) {
logger.error(" Cannot find " + propertiesFile + " file");
throw new IllegalArgumentException(propertiesFile + " file not found!");
}
Properties prop = new Properties();
try {
prop.load(is);
} catch (IOException e) {
logger.error(" Cannot load application.properties", e);
System.exit(0);
}
waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
logger.info(" waitTime: " + waitTime);
outputFolder = prop.getProperty("outputFolder").trim();
logger.info(" outputFolder: " + outputFolder);
numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
logger.info(" numberOfPagesToCrawlInALoop: " + numberOfPagesToCrawlInALoop);
totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
logger.info(" totalNumberOfPagesToCrawlInASession: " + totalNumberOfPagesToCrawlInASession);
scrapeVersion = Integer.parseInt(prop.getProperty("scrapeVersion").trim());
logger.info(" scrapeVersion: " + scrapeVersion);
logger.info("\n\n\n");
}
public String getSitemapUrl() {
return sitemapUrl;
}
public String getSitemapURLKey() {
return sitemapURLKey;
}
private String getId(String pageUrl) {
String[] parts = pageUrl.split("/");
return parts[parts.length - 1];
}
}
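
A hedged usage sketch of the driver (not part of this commit; the sitemap and filename are illustrative and mirror how ScrapingExecutor builds the call):
package eu.dnetlib.bioschemas.api;
public class ServiceScrapeDriverExample { // hypothetical example class
public static void main(String[] args) throws java.io.IOException {
// "loc" selects the per-page <loc> entries of the sitemap; null means scrape every page listed.
ServiceScrapeDriver driver = new ServiceScrapeDriver(
"https://disprot.org/sitemap2.xml.gz", "loc", null, "disprot_base64_gzipped_nquads.txt");
driver.start(); // reads application.properties, runs one scrape session, writes the N-Quads file into outputFolder
}
}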

View File

@ -0,0 +1,79 @@
package eu.dnetlib.bioschemas.api.controller;
import eu.dnetlib.bioschemas.api.scraper.ScrapingExecution;
import eu.dnetlib.bioschemas.api.scraper.ScrapingExecutor;
import eu.dnetlib.bioschemas.api.utils.BioschemasException;
import eu.dnetlib.common.controller.AbstractDnetController;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.web.bind.annotation.*;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
* @author enrico.ottonello
*
*/
@RestController
@RequestMapping("/api")
public class BioschemasAPIController extends AbstractDnetController {
@Value("${outputFolder}")
private String outputFolder;
@Value("${outputDataPattern}")
private String outputDataPattern;
private static Logger logger = LoggerFactory.getLogger(BioschemasAPIController.class);
@Autowired
private ScrapingExecutor scrapingExecutor;
private static final Log log = LogFactory.getLog(BioschemasAPIController.class);
@GetMapping("/startScraping")
public ScrapingExecution startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl, final HttpServletRequest req) {
logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
return scrapingExecutor.startScraping(datasourceKey, sitemapUrl, getOutputDataPattern(), req.getRemoteAddr());
}
@GetMapping("/startScraping/status")
public final ScrapingExecution statusScraping() {
return scrapingExecutor.getLastScrapingExecution();
}
@RequestMapping(value = "/getNQuads", method = RequestMethod.GET)
public String getNQuads(@RequestParam final String datasourceKey, HttpServletResponse response) throws BioschemasException, IOException {
logger.info("<GETNQUADS> datasourceKey: "+datasourceKey);
LineIterator it = FileUtils.lineIterator(new File(getOutputFolder().concat("/").concat(datasourceKey).concat(getOutputDataPattern())), "UTF-8");
try {
while (it.hasNext()) {
String line = it.nextLine();
response.getOutputStream().write(line.getBytes(StandardCharsets.UTF_8));
response.getOutputStream().println();
}
} finally {
it.close(); // close the LineIterator so the underlying reader is released
}
return "";
}
public String getOutputFolder() {
return outputFolder;
}
public String getOutputDataPattern() {
return outputDataPattern;
}
}
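
A hedged client sketch (not part of this commit) showing how these endpoints could be called with java.net.http, assuming the service runs locally with the port and context path from application.properties (8281, /bioschemas):
package eu.dnetlib.bioschemas.api.examples; // hypothetical package
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
public class BioschemasClientExample {
public static void main(String[] args) throws Exception {
String base = "http://localhost:8281/bioschemas/api"; // assumed local deployment
HttpClient client = HttpClient.newHttpClient();
// Trigger a scrape (sitemapUrl is URL-encoded, as in the examples file above).
HttpRequest start = HttpRequest.newBuilder(URI.create(base
+ "/startScraping?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz")).build();
System.out.println(client.send(start, HttpResponse.BodyHandlers.ofString()).body()); // ScrapingExecution as JSON
// Check the last execution, then download the scraped data (one base64+gzipped N-Quads value per line).
HttpRequest status = HttpRequest.newBuilder(URI.create(base + "/startScraping/status")).build();
System.out.println(client.send(status, HttpResponse.BodyHandlers.ofString()).body());
HttpRequest nquads = HttpRequest.newBuilder(URI.create(base + "/getNQuads?datasourceKey=disprot")).build();
System.out.println(client.send(nquads, HttpResponse.BodyHandlers.ofString()).body());
}
}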

View File

@ -0,0 +1,17 @@
package eu.dnetlib.bioschemas.api.controller;
import eu.dnetlib.common.controller.AbstractDnetController;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.GetMapping;
@Controller
public class HomeController extends AbstractDnetController {
@GetMapping({
"/doc", "/swagger"
})
public String apiDoc() {
return "redirect:swagger-ui/";
}
}

View File

@ -0,0 +1,136 @@
package eu.dnetlib.bioschemas.api.crawl;
import java.util.Date;
import hwu.elixir.utils.Validation;
/**
*
* Store the current status of a single URL in the scrape service.
*
*
*/
public class CrawlRecord {
private Long id;
private String context = "";
private String url;
private Date dateScraped;
private StatusOfScrape status;
private boolean beingScraped;
private String name;
private String nquads;
public CrawlRecord() {
status = StatusOfScrape.UNTRIED;
}
public CrawlRecord(String url) {
Validation validation = new Validation();
if(validation.validateURI(url)) {
this.url = url;
context = "";
status = StatusOfScrape.UNTRIED;
dateScraped = null;
} else {
throw new IllegalArgumentException(url +" is not a valid url");
}
this.setId(System.currentTimeMillis());
}
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public String getUrl() {
return url;
}
public Date getDateScraped() {
return dateScraped;
}
public void setDateScraped(Date dateScraped) {
this.dateScraped = dateScraped;
}
public StatusOfScrape getStatus() {
return status;
}
public void setStatus(StatusOfScrape status) {
this.status = status;
}
public String getContext() {
return context;
}
public void setContext(String context) {
this.context = context;
}
public boolean isBeingScraped() {
return beingScraped;
}
public void setBeingScraped(boolean beingScraped) {
this.beingScraped = beingScraped;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getNquads() {
return nquads;
}
public void setNquads(String nquads) {
this.nquads = nquads;
}
@Override
public boolean equals(Object o) {
if (this == o)
return true;
if (!(o instanceof CrawlRecord))
return false;
CrawlRecord otherCrawl = (CrawlRecord) o;
if(this.url.equals(otherCrawl.getUrl())) {
return true;
}
return false;
}
@Override
public int hashCode() {
int result = getId() != null ? getId().hashCode() : 0;
result = 31 * result + (getUrl() != null ? getUrl().hashCode() : 0);
result = 31 * result + (getContext() != null ? getContext().hashCode() : 0);
result = 31 * result + (getDateScraped() != null ? getDateScraped().hashCode() : 0);
return result;
}
}

View File

@ -0,0 +1,19 @@
package eu.dnetlib.bioschemas.api.crawl;
/**
*
* {@link StatusOfScrape} describes the possible status levels of the scrape for each URL/CrawlRecord.
*
* Each URL/CrawlRecord can have one of the following:
* DOES_NOT_EXIST = 404.
* HUMAN_INSPECTION = cannot parse for some reason; a human should see what is happening.
* UNTRIED = not scraped yet.
* FAILED = one failed attempt at scraping; will try again.
* GIVEN_UP = two failed attempts at scraping. Will not try again.
* SUCCESS = successfully scraped.
*
*/
public enum StatusOfScrape {
DOES_NOT_EXIST, HUMAN_INSPECTION, UNTRIED, FAILED, GIVEN_UP, SUCCESS;
}

View File

@ -0,0 +1,87 @@
package eu.dnetlib.bioschemas.api.scraper;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.StringWriter;
public class BMUSEScraper extends ScraperFilteredCore {
private static Logger logger = LoggerFactory.getLogger(BMUSEScraper.class);
public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
logger.debug(url + " > scraping");
url = fixURL(url);
String html = "";
// The dynamic flag determines whether the scraper uses Selenium (dynamic) or jsoup (static) to fetch the page.
if (dynamic) {
html = wrapHTMLExtraction(url);
} else {
html = wrapHTMLExtractionStatic(url);
}
if (html == null || html.contentEquals(""))
throw new Exception("empty html");
html = injectId(html, url);
logger.debug(url + " > html scraped from " + url);
DocumentSource source = new StringDocumentSource(html, url);
String n3 = html2Triples(source, url);
if (n3 == null) {
throw new MissingMarkupException(url);
}
logger.debug(url + " > processing triples");
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
Model updatedModel = processTriples(n3, sourceIRI, 0L);
if (updatedModel == null) {
throw new Exception("rdf model null");
}
logger.debug(url + " > generating nquads");
try (StringWriter jsonLDWriter = new StringWriter()) {
Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS);
logger.debug(url + " > nquads generated");
return jsonLDWriter.toString();
} catch (Exception e) {
throw e;
}
}
private String html2Triples(DocumentSource source, String url) throws Exception {
Any23 runner = new Any23();
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
TripleHandler handler = new NTriplesWriter(out);) {
runner.extract(source, handler);
return out.toString("UTF-8");
} catch (ExtractionException e) {
logger.error("Cannot extract triples", e);
} catch (IOException e1) {
logger.error(" IO error whilst extracting triples", e1);
} catch (TripleHandlerException e2) {
logger.error("TripleHanderException", e2);
}
return null;
}
}
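
For orientation, a hedged sketch of calling the scraper directly (page URL illustrative; in the service this call is made from ScrapeThread with dynamic = true):
package eu.dnetlib.bioschemas.api.scraper;
public class BMUSEScraperExample { // hypothetical example class
public static void main(String[] args) throws Exception {
BMUSEScraper scraper = new BMUSEScraper();
// true selects the Selenium (dynamic) path; false would use the static jsoup extraction.
String nquads = scraper.getNQUADSFromUrl("https://disprot.org/DP00086", true);
System.out.println(nquads);
scraper.shutdown(); // release the Selenium driver, mirroring the end of ScrapeThread.run()
}
}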

View File

@ -0,0 +1,157 @@
package eu.dnetlib.bioschemas.api.scraper;
import eu.dnetlib.bioschemas.api.crawl.StatusOfScrape;
import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
*/
public class ScrapeState {
private List<CrawlRecord> urlsToScrape = Collections.synchronizedList(new ArrayList<CrawlRecord>());
private List<CrawlRecord> urlsProcessed = Collections.synchronizedList(new ArrayList<CrawlRecord>()); // should this be a set?
private Map<String, Object> nquadsConcurrentHashMap = new ConcurrentHashMap<>();
/**
*
* @param pagesToBeScraped The list of sites to be scraped
* @see ScrapeThread
* @see CrawlRecord
*/
public ScrapeState(List<CrawlRecord> pagesToBeScraped) {
urlsToScrape.addAll(pagesToBeScraped);
}
/**
* Any pages/URLs left to scrape?
* @return True for yes & false for no
* @see CrawlRecord
*/
public synchronized boolean pagesLeftToScrape() {
return !urlsToScrape.isEmpty();
}
/**
* Returns the next URL/CrawlRecord to be scraped
*
* @return First page/URL that needs to be scraped next
* @see CrawlRecord
*/
public synchronized CrawlRecord getURLToProcess() {
if (urlsToScrape.isEmpty())
return null;
return urlsToScrape.remove(0);
}
/**
* Adds the given CrawlRecord to the list of CrawlRecords successfully scraped.
* Updates the status of the CrawlRecord to SUCCESS.
*
* @param record The CrawlRecord of the URL/page that has been successfully scraped
* @see CrawlRecord
*/
public synchronized void addSuccessfulScrapedURL(CrawlRecord record) {
record.setStatus(StatusOfScrape.SUCCESS);
urlsProcessed.add(record);
}
/**
* Adds the given CrawlRecord to the list of CrawlRecords NOT successfully scraped.
* Updates the status of the CrawlRecord; if first failure the status is FAILED.
* If status is already FAILED it is changed to GIVEN_UP.
*
* If the status is FAILED, another try will be made in a future run.
*
*
* @param record The CrawlRecord of the URL/page that could not be scraped
* @see CrawlRecord
*/
public synchronized void addFailedToScrapeURL(CrawlRecord record) {
if (record.getStatus().equals(StatusOfScrape.FAILED)) {
record.setStatus(StatusOfScrape.GIVEN_UP);
} else {
record.setStatus(StatusOfScrape.FAILED);
}
urlsProcessed.add(record);
}
/**
* Changes the status of the CrawlRecord to DOES_NOT_EXIST.
* As Selenium does not return the HTTP codes, it is questionable
* how useful this is.
*
*
* @param record The CrawlRecord of the URL/page that returned a 404
* @see CrawlRecord
*/
public synchronized void setStatusTo404(CrawlRecord record) {
record.setStatus(StatusOfScrape.DOES_NOT_EXIST);
urlsProcessed.add(record);
}
/**
*
* Changes the status of the CrawlRecord to HUMAN_INSPECTION.
* This captures the idea that the URLs may contain unexpected markup that needs a human to
* review and possibly update the scraper.
*
* @param record The CrawlRecord of the URL/page that needs human inspection
* @see CrawlRecord
*/
public synchronized void setStatusToHumanInspection(CrawlRecord record) {
record.setStatus(StatusOfScrape.HUMAN_INSPECTION);
urlsProcessed.add(record);
}
/**
* Returns the number of URLs that are still to be scraped in this cycle.
* This does not return the number of URLs left to scrape in the DBMS, just in the current cycle.
*
* @return Number of URLs left to scrape in this cycle
* @see CrawlRecord
*/
public synchronized int getNumberPagesLeftToScrape() {
return urlsToScrape.size();
}
/**
* Gets the full list of URLs that have been processed in this cycle.
* This does not return the number of URLs that have been scraped in total across all cycles.
*
* @return List of CrawlRecords processed in this cycle
* @see CrawlRecord
*/
public synchronized List<CrawlRecord> getPagesProcessed() {
return urlsProcessed;
}
/**
* Gets the full list of URLs/CrawlRecords regardless of whether scraped or not in the current cycle.
*
* @return List of all CrawlRecords in this cycle.
* @see CrawlRecord
*/
public synchronized List<CrawlRecord> getPagesProcessedAndUnprocessed() {
List<CrawlRecord> urlsCombined = Collections.synchronizedList(new ArrayList<CrawlRecord>());
urlsCombined.addAll(urlsProcessed);
urlsCombined.addAll(urlsToScrape);
return urlsCombined;
}
public void addNquads(String key, String nquads) {
nquadsConcurrentHashMap.putIfAbsent(key, nquads);
}
public Map<String, Object> getNquadsConcurrentHashMap() {
return nquadsConcurrentHashMap;
}
}

View File

@ -0,0 +1,103 @@
package eu.dnetlib.bioschemas.api.scraper;
import eu.dnetlib.bioschemas.api.crawl.CrawlRecord;
import eu.dnetlib.bioschemas.api.utils.CompressorUtil;
import hwu.elixir.scrape.exceptions.CannotWriteException;
import hwu.elixir.scrape.exceptions.FourZeroFourException;
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Date;
/**
*
* @see BMUSEScraper
* @see ScrapeState
*
*/
public class ScrapeThread extends Thread {
private ScrapeState scrapeState;
private BMUSEScraper process;
private int waitTime;
private boolean fileWritten = true;
private int scrapeVersion = 1;
private static Logger logger = LoggerFactory.getLogger(ScrapeThread.class);
/**
* Sets up a thread for the actual scraping.
*
* @param scraper Scraper that will actually do the scraping.
* @param scrapeState Object that maintains state across threads.
* @param waitTime How long the thread should wait after scraping a page before
* attempting the next one (run() sleeps 100 * waitTime milliseconds).
* @param contextVersion The context URL used is 'https://bioschemas.org/crawl/CONTEXTVERSION/ID', where ID is the id of the CrawlRecord pulled.
*
*/
public ScrapeThread(BMUSEScraper scraper, ScrapeState scrapeState, int waitTime, int contextVersion) {
this.scrapeState = scrapeState;
process = scraper;
this.waitTime = waitTime;
this.scrapeVersion = contextVersion;
}
/**
* Defines the high-level process of scraping. The actual scraping is done by an
* implementation of ScraperFilteredCore (here BMUSEScraper). If a page is scraped
* successfully, its CrawlRecord is added to the ScrapeState's processed list.
*
* @see BMUSEScraper
* @see ScrapeState
*/
@Override
public void run() {
while (scrapeState.pagesLeftToScrape()) {
CrawlRecord record = scrapeState.getURLToProcess();
if (record == null)
break;
record.setContext("https://bioschemas.org/crawl/" + scrapeVersion +"/" + record.getId());
record.setDateScraped(new Date());
try {
String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
logger.info("downloaded "+record.getUrl() + " leftToScrape:" + scrapeState.getNumberPagesLeftToScrape());
record.setNquads(CompressorUtil.compressValue(nquads));
if (!nquads.isEmpty()) {
scrapeState.addSuccessfulScrapedURL(record);
} else {
scrapeState.addFailedToScrapeURL(record);
}
} catch(FourZeroFourException fourZeroFourException) {
scrapeState.setStatusTo404(record);
fileWritten = false;
} catch (JsonLDInspectionException je) {
scrapeState.setStatusToHumanInspection(record);
fileWritten = false;
} catch (CannotWriteException cannotWrite) {
logger.error("Caught CannotWriteException, setting fileWritten to false!");
fileWritten = false;
scrapeState.addFailedToScrapeURL(record);
return; // no point in continuing
} catch (MissingMarkupException e) {
logger.error("Cannot obtain markup from " + record.getUrl() +".");
fileWritten = false;
scrapeState.addFailedToScrapeURL(record);
} catch (Exception e) {
e.printStackTrace();
}
try {
ScrapeThread.sleep(100 * waitTime);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
process.shutdown();
}
public boolean isFileWritten() {
return fileWritten;
}
}

View File

@ -0,0 +1,99 @@
package eu.dnetlib.bioschemas.api.scraper;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Date;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
public class ScrapingExecution {
private String id;
private Long dateStart;
private Long dateEnd;
private ScrapingStatus status = ScrapingStatus.NOT_YET_STARTED;
private String message;
private static final Log log = LogFactory.getLog(ScrapingExecution.class);
public ScrapingExecution() {}
public ScrapingExecution(final String id, final Long dateStart, final Long dateEnd, final ScrapingStatus status, final String message) {
this.id = id;
this.dateStart = dateStart;
this.dateEnd = dateEnd;
this.status = status;
this.message = message;
}
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public Long getDateStart() {
return dateStart;
}
public void setDateStart(final Long dateStart) {
this.dateStart = dateStart;
}
public Long getDateEnd() {
return dateEnd;
}
public void setDateEnd(final Long dateEnd) {
this.dateEnd = dateEnd;
}
public ScrapingStatus getStatus() {
return status;
}
public void setStatus(final ScrapingStatus status) {
this.status = status;
}
public String getMessage() {
return message;
}
public void setMessage(final String message) {
this.message = message;
}
public void startNew(final String message) {
setId("scraping-" + UUID.randomUUID());
setDateStart(System.currentTimeMillis());
setDateEnd(null);
setStatus(ScrapingStatus.RUNNING);
setMessage(message);
log.info(message);
}
public void complete() {
setDateEnd(System.currentTimeMillis());
setStatus(ScrapingStatus.SUCCESS);
final long millis = getDateEnd() - getDateStart();
setMessage(String
.format("Scraping completed in %d min, %d sec", TimeUnit.MILLISECONDS.toMinutes(millis), TimeUnit.MILLISECONDS.toSeconds(millis) -
TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))));
log.info(getMessage());
}
public void fail(final Throwable e) {
setDateEnd(new Date().getTime());
setStatus(ScrapingStatus.FAILED);
setMessage(e.getMessage());
log.error("Error scraping", e);
}
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.bioschemas.api.scraper;
import eu.dnetlib.bioschemas.api.ServiceScrapeDriver;
import org.springframework.stereotype.Component;
import javax.servlet.http.HttpServletRequest;
@Component
public class ScrapingExecutor {
private final ScrapingExecution lastScrapingExecution = new ScrapingExecution();
public ScrapingExecution getLastScrapingExecution() {
return lastScrapingExecution;
}
public ScrapingExecution startScraping(final String datasourceKey, final String sitemapUrl, final String outputDataPattern, final String remoteAddr) {
synchronized (lastScrapingExecution) {
if (lastScrapingExecution.getStatus() != ScrapingStatus.RUNNING) {
lastScrapingExecution.startNew("Scraping for " + datasourceKey + " " + sitemapUrl + " - request from " + remoteAddr);
new Thread(() -> {
try {
String sitemapUrlKey = "loc";
String outputFilename = datasourceKey.concat(outputDataPattern);
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
service.start();
lastScrapingExecution.complete();
} catch (final Throwable e) {
lastScrapingExecution.fail(e);
}
}).start();
} else {
final long now = System.currentTimeMillis();
return new ScrapingExecution(null, now, now, ScrapingStatus.NOT_LAUNCHED, "Another scraping is already running");
}
}
return lastScrapingExecution;
}
}

View File

@ -0,0 +1,9 @@
package eu.dnetlib.bioschemas.api.scraper;
public enum ScrapingStatus {
SUCCESS,
FAILED,
RUNNING,
NOT_LAUNCHED,
NOT_YET_STARTED
}

View File

@ -0,0 +1,71 @@
package eu.dnetlib.bioschemas.api.scraper;
import eu.dnetlib.bioschemas.api.crawl.StatusOfScrape;
import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Provides the actual scraping functionality.
*
* Scrapes a given URL, converts into NQuads and writes to a file (name derived
* from URL). If the file already exists it will be overwritten.
*
*
* @see ScraperFilteredCore
*
*/
public class ServiceScraper extends ScraperFilteredCore {
private static Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
private StatusOfScrape status= null;
/**
* Orchestrates the process of scraping a site before converting the extracted
* triples to NQuads and writing to a file.
*
* @param url Site to be scraped
* @param contextCounter Number used to generate the named graph/context and
* the URLs used to replace blank nodes.
* @param outputFolderName Location to which the NQuads will be written
* @return True if success; false otherwise
* @throws FourZeroFourException
* @throws JsonLDInspectionException
* @throws CannotWriteException
* @throws MissingMarkupException
*
*/
public boolean scrape(String url, Long contextCounter, String outputFolderName, String fileName, StatusOfScrape status) throws FourZeroFourException, JsonLDInspectionException, CannotWriteException, MissingMarkupException {
this.status = status;
logger.info("scraping "+url + " to "+fileName);
return scrape(url, outputFolderName, fileName, contextCounter, true);
}
@Override
/* Now takes account of the StatusOfScrape. */
protected String wrapHTMLExtraction(String url) throws FourZeroFourException {
String html = "";
if (status.equals(StatusOfScrape.UNTRIED) || status.equals(StatusOfScrape.FAILED)) {
try {
html = getHtmlViaSelenium(url);
} catch (SeleniumException e) {
// try again
try {
html = getHtmlViaSelenium(url);
} catch (SeleniumException e2) {
return "";
}
}
} else {
return "";
}
return html;
}
}

View File

@ -0,0 +1,28 @@
package eu.dnetlib.bioschemas.api.utils;
/**
* @author enrico.ottonello
*
*/
public class BioschemasException extends Exception{
public BioschemasException() {
}
public BioschemasException(final String message) {
super(message);
}
public BioschemasException(final String message, final Throwable cause) {
super(message, cause);
}
public BioschemasException(final Throwable cause) {
super(cause);
}
public BioschemasException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) {
super(message, cause, enableSuppression, writableStackTrace);
}
}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.bioschemas.api.utils;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
public class CompressorUtil {
public static String decompressValue(final String abstractCompressed) {
try {
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
final StringWriter stringWriter = new StringWriter();
IOUtils.copy(gis, stringWriter);
return stringWriter.toString();
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
public static String compressValue(final String value) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(value.getBytes());
gzip.close();
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
}
}
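
A minimal round-trip sketch (sample quad is illustrative): compressValue produces the base64+gzipped string that ScrapeThread stores in CrawlRecord.nquads, and decompressValue restores it; the commons-codec decoder accepts the standard Base64 emitted by the encoder.
package eu.dnetlib.bioschemas.api.utils;
public class CompressorUtilExample { // hypothetical example class
public static void main(String[] args) throws java.io.IOException {
String original = "<https://example.org/s> <https://example.org/p> \"o\" <https://example.org/g> ."; // sample N-Quad
String stored = CompressorUtil.compressValue(original); // what ends up in the output file, one value per line
String restored = CompressorUtil.decompressValue(stored);
System.out.println(original.equals(restored)); // prints true
}
}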

View File

@ -0,0 +1,64 @@
package eu.dnetlib.bioschemas.api.utils;
import hwu.elixir.utils.Helpers;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
public class UrlParser {
private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());
public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
Document doc = new Document(url);
Document urlSitemapListsNested;
Elements elements = new Elements();
Elements sitemaps = new Elements();
boolean sitemapindex = false;
boolean urlset = false;
try {
int urlLength = url.length();
logger.info("parse sitemap list");
String sitemapExt = url.substring(urlLength - 3, urlLength);
if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
logger.info("compressed sitemap");
byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
doc = Helpers.gzipFileDecompression(bytes);
} else {
doc = Jsoup.connect(url).maxBodySize(0).get();
}
} catch (IOException e) {
logger.error("Jsoup parsing exception: " + e.getMessage());
}
try {
elements = doc.select(sitemapURLKey);
// check the html if it is a sitemapindex or a urlset
sitemapindex = doc.outerHtml().contains("sitemapindex");
urlset = doc.outerHtml().contains("urlset");
} catch (NullPointerException e) {
logger.error(e.getMessage());
}
if (sitemapindex) {
// if sitemapindex get the loc of all the sitemaps
// added warning for sitemap index files
logger
.warn(
"please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
sitemaps = doc.select(sitemapURLKey);
}
return elements;
}
}
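
A hedged usage sketch mirroring ServiceScrapeDriver.runScrape (sitemap URL illustrative): "loc" is the element key the service uses, and each returned element's text is a page URL.
package eu.dnetlib.bioschemas.api.utils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class UrlParserExample { // hypothetical example class
public static void main(String[] args) throws java.io.IOException {
Elements pages = UrlParser.getSitemapList("https://disprot.org/sitemap2.xml.gz", "loc");
for (Element page : pages) {
System.out.println(page.text()); // each URL becomes a CrawlRecord in ServiceScrapeDriver
}
}
}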

View File

@ -0,0 +1,25 @@
server.servlet.context-path=/bioschemas
server.port=8281
spring.profiles.active=garr
logging.file.name = /var/log/bioschemas/log/bioschemas-api.log
maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
spring.main.banner-mode = off
logging.level.root = INFO
management.endpoints.web.exposure.include = prometheus,health
management.endpoints.web.base-path = /
management.endpoints.web.path-mapping.prometheus = metrics
management.endpoints.web.path-mapping.health = health
waitTime=5
outputFolder=/data
outputDataPattern=_base64_gzipped_nquads.txt
numberOfPagesToCrawlInALoop=8
totalNumberOfPagesToCrawlInASession=32
chromiumDriverLocation = /usr/local/bin/chromedriver
scrapeVersion=1
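
For orientation, a worked example with these defaults: the session loop in ServiceScrapeDriver.runScrape runs totalNumberOfPagesToCrawlInASession / numberOfPagesToCrawlInALoop = 32 / 8 = 4 iterations, ScrapeThread pauses 100 * waitTime = 500 ms between pages, and the output for a datasource such as disprot would be written to /data/disprot_base64_gzipped_nquads.txt (datasource key illustrative).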

View File

@ -18,6 +18,7 @@
<module>dnet-orgs-database-application</module>
<module>dnet-exporter-api</module>
<module>scholexplorer-api</module>
<module>bioschemas-api</module>
</modules>
<dependencies>

pom.xml (50 changed lines)
View File

@ -88,6 +88,18 @@
<name>Cloudera Repository</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
</repository>
<repository>
<id>dnet-deps</id>
<name>D-Net Dependencies</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
<layout>default</layout>
</repository>
</repositories>
<dependencies>
@ -266,6 +278,43 @@
</exclusions>
</dependency>
<!-- Bioschemas BMUSE -->
<dependency>
<groupId>hwu.elixir</groupId>
<artifactId>bmuse-core</artifactId>
<version>0.5.4</version>
</dependency>
<dependency>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId>
<version>2.3</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId>
<version>3.7.1</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-model</artifactId>
<version>3.7.1</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.6</version>
</dependency>
</dependencies>
</dependencyManagement>
@ -406,5 +455,6 @@
<javamelody.version>1.71.0</javamelody.version>
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
<dockerfile-maven-version>1.3.6</dockerfile-maven-version>
<bioschemas-commons-io-version>2.6</bioschemas-commons-io-version>
</properties>
</project>