added app for bioschemas sources harvesting
This commit is contained in:
parent
19010a9624
commit
7375534764
|
@ -0,0 +1,10 @@
|
||||||
|
{
|
||||||
|
"type_source": "SVN",
|
||||||
|
"goal": "package -U source:jar",
|
||||||
|
"url": "http://svn-public.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk/",
|
||||||
|
"deploy_repository": "dnet5-snapshots",
|
||||||
|
"version": "5",
|
||||||
|
"mail": "sandro.labruzzo@isti.cnr.it,michele.artini@isti.cnr.it, claudio.atzori@isti.cnr.it, alessia.bardi@isti.cnr.it, enrico.ottonello@isti.cnr.it",
|
||||||
|
"deploy_repository_url": "http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots",
|
||||||
|
"name": "dnet-ariadneplus-graphdb-publisher"
|
||||||
|
}
|
|
@ -0,0 +1,86 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<parent>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>apps</artifactId>
|
||||||
|
<version>3.2.8-SNAPSHOT</version>
|
||||||
|
<relativePath>../pom.xml</relativePath>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
<artifactId>bioschemas-api</artifactId>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-starter-test</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>hwu.elixir</groupId>
|
||||||
|
<artifactId>bmuse-core</artifactId>
|
||||||
|
<version>0.5.4</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.freemarker</groupId>
|
||||||
|
<artifactId>freemarker</artifactId>
|
||||||
|
<version>2.3.27-incubating</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.any23</groupId>
|
||||||
|
<artifactId>apache-any23-core</artifactId>
|
||||||
|
<version>2.3</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.eclipse.rdf4j</groupId>
|
||||||
|
<artifactId>rdf4j-rio-rdfxml</artifactId>
|
||||||
|
<version>3.7.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.eclipse.rdf4j</groupId>
|
||||||
|
<artifactId>rdf4j-model</artifactId>
|
||||||
|
<version>3.7.1</version>
|
||||||
|
</dependency>
|
||||||
|
<!-- NOTE: the rdf4j artifacts above were moved from version 2.5.4 to 3.7.1 -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jsoup</groupId>
|
||||||
|
<artifactId>jsoup</artifactId>
|
||||||
|
<version>1.13.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.seleniumhq.selenium</groupId>
|
||||||
|
<artifactId>selenium-java</artifactId>
|
||||||
|
<version>3.141.59</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-io</groupId>
|
||||||
|
<artifactId>commons-io</artifactId>
|
||||||
|
<version>2.6</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-validator</groupId>
|
||||||
|
<artifactId>commons-validator</artifactId>
|
||||||
|
<version>1.6</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>ch.qos.logback</groupId>
|
||||||
|
<artifactId>logback-classic</artifactId>
|
||||||
|
<version>1.2.3</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-help-plugin</artifactId>
|
||||||
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
</project>
|
|
@ -0,0 +1,173 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||||
|
<parent>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-starter-parent</artifactId>
|
||||||
|
<version>2.1.3.RELEASE</version>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<groupId>eu.dnetlib</groupId>
|
||||||
|
<artifactId>dnet-bmuse-webapp</artifactId>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
<version>1.0.0-SNAPSHOT</version>
|
||||||
|
<scm>
|
||||||
|
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk</developerConnection>
|
||||||
|
<url>https://github.com/spring-projects/spring-boot/spring-boot-starter-parent/dnet-bmuse-webapp</url>
|
||||||
|
</scm>
|
||||||
|
<ciManagement>
|
||||||
|
<system>jenkins</system>
|
||||||
|
<url>https://jenkins-dnet.d4science.org/view/DNet50/job/DSL50_dnet-bmuse-webapp/</url>
|
||||||
|
</ciManagement>
|
||||||
|
<distributionManagement>
|
||||||
|
<repository>
|
||||||
|
<id>dnet5-releases</id>
|
||||||
|
<name>D-Net 5 Releases</name>
|
||||||
|
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
|
||||||
|
<layout>default</layout>
|
||||||
|
</repository>
|
||||||
|
</distributionManagement>
|
||||||
|
|
||||||
|
<!-- Defaults (dependency and plugin management) are inherited from the spring-boot-starter-parent declared in the parent element above -->
|
||||||
|
|
||||||
|
<repositories>
|
||||||
|
<repository>
|
||||||
|
<id>dnet-deps</id>
|
||||||
|
<name>D-Net Dependencies</name>
|
||||||
|
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
|
||||||
|
<releases>
|
||||||
|
<enabled>true</enabled>
|
||||||
|
</releases>
|
||||||
|
<snapshots>
|
||||||
|
<enabled>false</enabled>
|
||||||
|
</snapshots>
|
||||||
|
<layout>default</layout>
|
||||||
|
</repository>
|
||||||
|
<repository>
|
||||||
|
<id>dnet5-releases</id>
|
||||||
|
<name>D-Net 5 Releases</name>
|
||||||
|
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
|
||||||
|
<layout>default</layout>
|
||||||
|
<snapshots>
|
||||||
|
<enabled>false</enabled>
|
||||||
|
</snapshots>
|
||||||
|
</repository>
|
||||||
|
<repository>
|
||||||
|
<id>dnet5-snapshots</id>
|
||||||
|
<name>D-Net 5 Snapshots</name>
|
||||||
|
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots</url>
|
||||||
|
<layout>default</layout>
|
||||||
|
<snapshots>
|
||||||
|
<enabled>true</enabled>
|
||||||
|
</snapshots>
|
||||||
|
</repository>
|
||||||
|
</repositories>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<version>4.13-rc-1</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-starter-test</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-autoconfigure</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-starter-web</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>hwu.elixir</groupId>
|
||||||
|
<artifactId>bmuse-core</artifactId>
|
||||||
|
<version>0.5.4</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.freemarker</groupId>
|
||||||
|
<artifactId>freemarker</artifactId>
|
||||||
|
<version>2.3.27-incubating</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.any23</groupId>
|
||||||
|
<artifactId>apache-any23-core</artifactId>
|
||||||
|
<version>2.3</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.eclipse.rdf4j</groupId>
|
||||||
|
<artifactId>rdf4j-rio-rdfxml</artifactId>
|
||||||
|
<version>3.7.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.eclipse.rdf4j</groupId>
|
||||||
|
<artifactId>rdf4j-model</artifactId>
|
||||||
|
<version>3.7.1</version>
|
||||||
|
</dependency>
|
||||||
|
<!-- NOTE: the rdf4j artifacts above were moved from version 2.5.4 to 3.7.1 -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jsoup</groupId>
|
||||||
|
<artifactId>jsoup</artifactId>
|
||||||
|
<version>1.13.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.seleniumhq.selenium</groupId>
|
||||||
|
<artifactId>selenium-java</artifactId>
|
||||||
|
<version>3.141.59</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-io</groupId>
|
||||||
|
<artifactId>commons-io</artifactId>
|
||||||
|
<version>2.6</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-validator</groupId>
|
||||||
|
<artifactId>commons-validator</artifactId>
|
||||||
|
<version>1.6</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>ch.qos.logback</groupId>
|
||||||
|
<artifactId>logback-classic</artifactId>
|
||||||
|
<version>1.2.3</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.springframework.boot</groupId>
|
||||||
|
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<executable>true</executable>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<java.version>1.8</java.version>
|
||||||
|
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<profiles>
|
||||||
|
<profile>
|
||||||
|
<id>java8-doclint-disabled</id>
|
||||||
|
<activation>
|
||||||
|
<jdk>[1.8,)</jdk>
|
||||||
|
</activation>
|
||||||
|
<properties>
|
||||||
|
<javadoc.opts>-Xdoclint:none</javadoc.opts>
|
||||||
|
</properties>
|
||||||
|
</profile>
|
||||||
|
</profiles>
|
||||||
|
</project>
|
|
@ -0,0 +1,45 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp;
|
||||||
|
|
||||||
|
import org.springframework.boot.web.client.RestTemplateBuilder;
|
||||||
|
import org.springframework.context.annotation.Bean;
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
import org.springframework.context.annotation.Profile;
|
||||||
|
import org.springframework.web.client.RestTemplate;
|
||||||
|
|
||||||
|
import freemarker.cache.ClassTemplateLoader;
|
||||||
|
import freemarker.template.TemplateExceptionHandler;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author enrico.ottonello
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
@Profile("garr")
|
||||||
|
@Configuration
|
||||||
|
public class AppConfigGarr {
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
public RestTemplate jrrRestTemplate(){
|
||||||
|
//TODO: move configuration here from CatalogueRegistrator?
|
||||||
|
return new RestTemplateBuilder().build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
public freemarker.template.Configuration freemarkerConfig(){
|
||||||
|
freemarker.template.Configuration config = new freemarker.template.Configuration(freemarker.template.Configuration.VERSION_2_3_27);
|
||||||
|
ClassTemplateLoader ctl = new ClassTemplateLoader(getClass(), "/eu/dnetlib/bmuse_webapp/sparql");
|
||||||
|
config.setTemplateLoader(ctl);
|
||||||
|
config.setDefaultEncoding("UTF-8");
|
||||||
|
// Sets how errors will appear.
|
||||||
|
// During web page *development* TemplateExceptionHandler.HTML_DEBUG_HANDLER is better.
|
||||||
|
config.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
|
||||||
|
|
||||||
|
// Don't log exceptions inside FreeMarker that it will thrown at you anyway:
|
||||||
|
config.setLogTemplateExceptions(false);
|
||||||
|
|
||||||
|
// Wrap unchecked exceptions thrown during template processing into TemplateException-s.
|
||||||
|
config.setWrapUncheckedExceptions(true);
|
||||||
|
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,42 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp;
|
||||||
|
|
||||||
|
import eu.dnetlib.common.app.AbstractDnetApp;
|
||||||
|
import org.springframework.boot.SpringApplication;
|
||||||
|
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||||
|
import org.springframework.cache.annotation.EnableCaching;
|
||||||
|
import org.springframework.context.annotation.ComponentScan;
|
||||||
|
import org.springframework.scheduling.annotation.EnableScheduling;
|
||||||
|
import springfox.documentation.builders.ApiInfoBuilder;
|
||||||
|
import springfox.documentation.builders.RequestHandlerSelectors;
|
||||||
|
import springfox.documentation.service.ApiInfo;
|
||||||
|
import springfox.documentation.spring.web.plugins.Docket;
|
||||||
|
import springfox.documentation.swagger2.annotations.EnableSwagger2;
|
||||||
|
|
||||||
|
@SpringBootApplication
|
||||||
|
@EnableSwagger2
|
||||||
|
@EnableCaching
|
||||||
|
@EnableScheduling
|
||||||
|
@ComponentScan(basePackages = "eu.dnetlib")
|
||||||
|
public class MainApplication extends AbstractDnetApp {
|
||||||
|
|
||||||
|
public static void main(final String[] args) {
|
||||||
|
SpringApplication.run(MainApplication.class, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void configSwagger(final Docket docket) {
|
||||||
|
docket.select()
|
||||||
|
.apis(RequestHandlerSelectors.any())
|
||||||
|
.paths(p -> p.contains("/api/"))
|
||||||
|
.build()
|
||||||
|
.apiInfo(new ApiInfoBuilder()
|
||||||
|
.title("D-Net Bioschemas Service APIs")
|
||||||
|
.description("APIs documentation")
|
||||||
|
.version("1.1")
|
||||||
|
.contact(ApiInfo.DEFAULT_CONTACT)
|
||||||
|
.license("Apache 2.0")
|
||||||
|
.licenseUrl("http://www.apache.org/licenses/LICENSE-2.0")
|
||||||
|
.build());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,261 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp;
|
||||||
|
|
||||||
|
import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
|
||||||
|
import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
|
||||||
|
import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
|
||||||
|
import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;
|
||||||
|
import eu.dnetlib.bmuse_webapp.scraper.ServiceScraper;
|
||||||
|
import eu.dnetlib.bmuse_webapp.utils.UrlParser;
|
||||||
|
import hwu.elixir.utils.Helpers;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs the scrape. Collect a list of URLs (in the form of CrawlRecords) to scrape.
|
||||||
|
* Scrapes them in turn, writes the (bio)schema markup extracted to a file (1 file per URL)
|
||||||
|
* and adds provenance to the CrawlRecord.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ServiceScrapeDriver {
|
||||||
|
|
||||||
|
private static final String propertiesFile = "application.properties";
|
||||||
|
|
||||||
|
private int waitTime = 1;
|
||||||
|
private int numberOfPagesToCrawlInALoop;
|
||||||
|
private int totalNumberOfPagesToCrawlInASession;
|
||||||
|
private String outputFolder;
|
||||||
|
private int pagesCounter = 0;
|
||||||
|
private int scrapeVersion = 1;
|
||||||
|
|
||||||
|
private String sitemapUrl;
|
||||||
|
private String sitemapURLKey;
|
||||||
|
private String maxScrapedPages;
|
||||||
|
private String outputFilename;
|
||||||
|
|
||||||
|
private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
|
||||||
|
|
||||||
|
private static final Log logger = LogFactory.getLog(ServiceScrapeDriver.class);
|
||||||
|
|
||||||
|
public ServiceScrapeDriver(String sitemapUrl, String sitemapURLKey, String maxScrapedPages, String outputFilename) {
|
||||||
|
this.sitemapUrl = sitemapUrl;
|
||||||
|
this.sitemapURLKey = sitemapURLKey;
|
||||||
|
this.maxScrapedPages = maxScrapedPages;
|
||||||
|
this.outputFilename = outputFilename;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs the scrape process
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public void start() throws IOException {
|
||||||
|
runScrape();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fires off threads
|
||||||
|
* Originally designed as a multi-threaded process; now reduced to a single thread as
|
||||||
|
* the selenium webdriver is too expensive to run multi-threaded. However, the threading
|
||||||
|
* as been left in situ in case it is useful in the future.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
private void runScrape() throws IOException {
|
||||||
|
processProperties();
|
||||||
|
String url = sitemapUrl.toLowerCase();
|
||||||
|
Elements urls = UrlParser.getSitemapList(getSitemapUrl(), getSitemapURLKey());
|
||||||
|
Stream<Element> urlStream = null;
|
||||||
|
if (Objects.nonNull(maxScrapedPages)) {
|
||||||
|
urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
|
||||||
|
} else {
|
||||||
|
urlStream = urls.stream();
|
||||||
|
}
|
||||||
|
List<Element> sites = urlStream.collect(Collectors.toList());
|
||||||
|
logger.info("Pages available for scraping: " + sites.size());
|
||||||
|
|
||||||
|
List<CrawlRecord> pagesToPull = generatePagesToPull(sites);
|
||||||
|
if (pagesToPull.isEmpty()) {
|
||||||
|
logger.error("Cannot retrieve URLs");
|
||||||
|
throw new RuntimeException("No pages found from sitemap");
|
||||||
|
}
|
||||||
|
|
||||||
|
ScrapeState scrapeState = new ScrapeState(pagesToPull);
|
||||||
|
|
||||||
|
logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
|
||||||
|
while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
|
||||||
|
logger.info(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
|
||||||
|
|
||||||
|
ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
|
||||||
|
scrape1.setName("S1");
|
||||||
|
|
||||||
|
// ScrapeThread scrape2 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
|
||||||
|
// scrape2.setName("S2");
|
||||||
|
//
|
||||||
|
// ScrapeThread scrape3 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
|
||||||
|
// scrape3.setName("S3");
|
||||||
|
//
|
||||||
|
// ScrapeThread scrape4 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
|
||||||
|
// scrape4.setName("S4");
|
||||||
|
|
||||||
|
scrape1.start();
|
||||||
|
// scrape2.start();
|
||||||
|
// scrape3.start();
|
||||||
|
// scrape4.start();
|
||||||
|
long startTime = System.nanoTime();
|
||||||
|
|
||||||
|
try {
|
||||||
|
scrape1.join();
|
||||||
|
// scrape2.join();
|
||||||
|
// scrape3.join();
|
||||||
|
// scrape4.join();
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
logger.error("Exception waiting on thread");
|
||||||
|
e.printStackTrace();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!scrape1.isFileWritten()) {
|
||||||
|
logger.error("Could not write output file so shutting down!");
|
||||||
|
Date date = new Date(System.currentTimeMillis());
|
||||||
|
logger.info("ENDING CRAWL after failure at: " + formatter.format(date));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
|
||||||
|
|
||||||
|
long endTime = System.nanoTime();
|
||||||
|
long timeElapsed = endTime - startTime;
|
||||||
|
logger.info("Time in s to complete: " + timeElapsed / 1e+9);
|
||||||
|
|
||||||
|
updateDatabase(scrapeState);
|
||||||
|
pagesCounter += numberOfPagesToCrawlInALoop;
|
||||||
|
|
||||||
|
|
||||||
|
logger.info("ENDED loop");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Map<String, Object> nquads = scrapeState.getNquadsConcurrentHashMap();
|
||||||
|
// logger.info("Available nquads records: "+nquads.size() );
|
||||||
|
|
||||||
|
logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
|
||||||
|
|
||||||
|
File output = new File(outputFolder.concat("/").concat(outputFilename));
|
||||||
|
if (output.exists()) {
|
||||||
|
output.delete();
|
||||||
|
output.createNewFile();
|
||||||
|
}
|
||||||
|
FileWriter fileWriter;
|
||||||
|
BufferedWriter bufferedWriter;
|
||||||
|
fileWriter = new FileWriter(output.getAbsoluteFile(), true); // true to append
|
||||||
|
bufferedWriter = new BufferedWriter(fileWriter);
|
||||||
|
|
||||||
|
List<CrawlRecord> processed = scrapeState.getPagesProcessed();
|
||||||
|
for (int i=0;i<processed.size();i++) {
|
||||||
|
try {
|
||||||
|
bufferedWriter.write(processed.get(i).getNquads());
|
||||||
|
bufferedWriter.newLine();
|
||||||
|
bufferedWriter.flush();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bufferedWriter.close();
|
||||||
|
logger.info(" dump to "+output.getAbsolutePath());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param scrapeState State of scrape at end
|
||||||
|
* @return true if success / false otherwise
|
||||||
|
* @see ScrapeState
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
private boolean updateDatabase(ScrapeState scrapeState) {
|
||||||
|
boolean result = false;
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a list of URLs (in the form of CrawlRecords) that need to be scraped
|
||||||
|
*
|
||||||
|
* @return List of URLs to be scraped
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
private List<CrawlRecord> generatePagesToPull(List<Element> sites) {
|
||||||
|
List<CrawlRecord> crawls = sites
|
||||||
|
.stream()
|
||||||
|
.map(s -> {
|
||||||
|
CrawlRecord crawlRecord = new CrawlRecord(s.text());
|
||||||
|
String[] urlSplitted = crawlRecord.getUrl().split("/");
|
||||||
|
String name = urlSplitted[urlSplitted.length - 1];
|
||||||
|
crawlRecord.setName(name);
|
||||||
|
return crawlRecord;
|
||||||
|
})
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
return crawls;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates properties based on properties file in src > main > resources
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
private void processProperties() {
|
||||||
|
ClassLoader classLoader = ServiceScrapeDriver.class.getClassLoader();
|
||||||
|
|
||||||
|
InputStream is = classLoader.getResourceAsStream(propertiesFile);
|
||||||
|
if(is == null) {
|
||||||
|
logger.error(" Cannot find " + propertiesFile + " file");
|
||||||
|
throw new IllegalArgumentException(propertiesFile + "file is not found!");
|
||||||
|
}
|
||||||
|
|
||||||
|
Properties prop = new Properties();
|
||||||
|
|
||||||
|
try {
|
||||||
|
prop.load(is);
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.error(" Cannot load application.properties", e);
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
waitTime = Integer.parseInt(prop.getProperty("waitTime").trim());
|
||||||
|
logger.info(" waitTime: " + waitTime);
|
||||||
|
outputFolder = prop.getProperty("outputFolder").trim();
|
||||||
|
logger.info(" outputFolder: " + outputFolder);
|
||||||
|
numberOfPagesToCrawlInALoop = Integer.parseInt(prop.getProperty("numberOfPagesToCrawlInALoop").trim());
|
||||||
|
logger.info(" numberOfPagesToCrawl: " + numberOfPagesToCrawlInALoop);
|
||||||
|
totalNumberOfPagesToCrawlInASession = Integer.parseInt(prop.getProperty("totalNumberOfPagesToCrawlInASession").trim());
|
||||||
|
logger.info(" totalNumberOfPagesToCrawlInASession: " + totalNumberOfPagesToCrawlInASession);
|
||||||
|
scrapeVersion = Integer.parseInt(prop.getProperty("scrapeVersion").trim());
|
||||||
|
logger.info(" scrapeVersion: " + scrapeVersion);
|
||||||
|
logger.info("\n\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSitemapUrl() {
|
||||||
|
return sitemapUrl;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSitemapURLKey() {
|
||||||
|
return sitemapURLKey;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getId(String pageUrl) {
|
||||||
|
String[] parts = pageUrl.split("/");
|
||||||
|
return parts[parts.length - 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,136 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp.crawl;
|
||||||
|
|
||||||
|
import java.util.Date;
|
||||||
|
|
||||||
|
import hwu.elixir.utils.Validation;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* Store the current status of a single URL in the scrape service.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
public class CrawlRecord {
|
||||||
|
|
||||||
|
private Long id;
|
||||||
|
|
||||||
|
private String context = "";
|
||||||
|
|
||||||
|
private String url;
|
||||||
|
|
||||||
|
private Date dateScraped;
|
||||||
|
|
||||||
|
private StatusOfScrape status;
|
||||||
|
|
||||||
|
private boolean beingScraped;
|
||||||
|
|
||||||
|
private String name;
|
||||||
|
|
||||||
|
private String nquads;
|
||||||
|
|
||||||
|
public CrawlRecord() {
|
||||||
|
status = StatusOfScrape.UNTRIED;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CrawlRecord(String url) {
|
||||||
|
Validation validation = new Validation();
|
||||||
|
if(validation.validateURI(url)) {
|
||||||
|
this.url = url;
|
||||||
|
context = "";
|
||||||
|
status = StatusOfScrape.UNTRIED;
|
||||||
|
dateScraped = null;
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException(url +" is not a valid url");
|
||||||
|
}
|
||||||
|
this.setId(System.currentTimeMillis());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Long getId() {
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setId(Long id) {
|
||||||
|
this.id = id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getUrl() {
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Date getDateScraped() {
|
||||||
|
return dateScraped;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDateScraped(Date dateScraped) {
|
||||||
|
this.dateScraped = dateScraped;
|
||||||
|
}
|
||||||
|
|
||||||
|
public StatusOfScrape getStatus() {
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setStatus(StatusOfScrape status) {
|
||||||
|
this.status = status;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getContext() {
|
||||||
|
return context;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setContext(String context) {
|
||||||
|
this.context = context;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isBeingScraped() {
|
||||||
|
return beingScraped;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBeingScraped(boolean beingScraped) {
|
||||||
|
this.beingScraped = beingScraped;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setName(String name) {
|
||||||
|
this.name = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getNquads() {
|
||||||
|
return nquads;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setNquads(String nquads) {
|
||||||
|
this.nquads = nquads;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (this == o)
|
||||||
|
return true;
|
||||||
|
if (!(o instanceof CrawlRecord))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
CrawlRecord otherCrawl = (CrawlRecord) o;
|
||||||
|
|
||||||
|
if(this.url.equals(otherCrawl.getUrl())) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
int result = getId() != null ? getId().hashCode() : 0;
|
||||||
|
result = 31 * result + (getUrl() != null ? getUrl().hashCode() : 0);
|
||||||
|
result = 31 * result + (getContext() != null ? getContext().hashCode() : 0);
|
||||||
|
result = 31 * result + (getDateScraped() != null ? getDateScraped().hashCode() : 0);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp.crawl;
|
||||||
|
|
||||||
|
/**
 * Status levels that the scrape of a single URL/CrawlRecord can be in.
 * Exactly one of the values below applies to each URL/CrawlRecord.
 */
public enum StatusOfScrape {

    /** The URL answered with a 404. */
    DOES_NOT_EXIST,

    /** Cannot parse for some reason; a human should see what is happening. */
    HUMAN_INSPECTION,

    /** Not scraped yet. */
    UNTRIED,

    /** One failed attempt at scraping; will try again. */
    FAILED,

    /** Two failed attempts at scraping. Will not try again. */
    GIVEN_UP,

    /** Successfully scraped. */
    SUCCESS
}
|
|
@ -0,0 +1,65 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp.publisher;
|
||||||
|
|
||||||
|
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
|
||||||
|
import eu.dnetlib.common.controller.AbstractDnetController;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.io.LineIterator;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.tomcat.jni.FileInfo;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.web.bind.annotation.*;
|
||||||
|
|
||||||
|
import javax.servlet.http.HttpServletResponse;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author enrico.ottonello
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("/api")
|
||||||
|
public class BMUSEWebappController extends AbstractDnetController {
|
||||||
|
|
||||||
|
private static final Log log = LogFactory.getLog(BMUSEWebappController.class);
|
||||||
|
|
||||||
|
@RequestMapping(value = "/version", method = RequestMethod.GET)
|
||||||
|
public String version() throws BMUSEWebappException {
|
||||||
|
return "1.0.0-SNAPSHOT";
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/scrape", method = RequestMethod.GET)
|
||||||
|
public String scrape(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
|
||||||
|
|
||||||
|
log.info("datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
|
||||||
|
// String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz"; scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
|
||||||
|
// String sitemapUrl = "https://proteinensemble.org/sitemap2.xml.gz"; scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
|
||||||
|
// String sitemapUrl = "https://disprot.org/sitemap2.xml.gz"; scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
|
||||||
|
String sitemapUrlKey = "loc";
|
||||||
|
String outputFilename = datasourceKey.concat("_base64_gzipped_nquads.txt");
|
||||||
|
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
|
||||||
|
service.start();
|
||||||
|
return "started";
|
||||||
|
}
|
||||||
|
|
||||||
|
@RequestMapping(value = "/nquads", method = RequestMethod.GET)
|
||||||
|
public String nquads(HttpServletResponse response) throws BMUSEWebappException, IOException {
|
||||||
|
LineIterator it = FileUtils.lineIterator(new File("/Users/enrico.ottonello/data/bmuse-output/output.nq"), "UTF-8");
|
||||||
|
try {
|
||||||
|
while (it.hasNext()) {
|
||||||
|
String line = it.nextLine();
|
||||||
|
response.getOutputStream().write(line.getBytes(StandardCharsets.UTF_8));
|
||||||
|
response.getOutputStream().println();
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp.publisher;
|
||||||
|
|
||||||
|
/**
 * Checked exception declared by the endpoints of the BMUSE web application.
 * Every constructor simply forwards to the matching {@link Exception} constructor.
 *
 * @author enrico.ottonello
 */
public class BMUSEWebappException extends Exception {

	/** Creates an exception without message and without cause. */
	public BMUSEWebappException() {
		super();
	}

	/**
	 * Creates an exception that only wraps another throwable.
	 *
	 * @param cause the underlying failure
	 */
	public BMUSEWebappException(final Throwable cause) {
		super(cause);
	}

	/**
	 * Creates an exception carrying a description of the problem.
	 *
	 * @param message the detail message
	 */
	public BMUSEWebappException(final String message) {
		super(message);
	}

	/**
	 * Creates an exception carrying both a description and the underlying failure.
	 *
	 * @param message the detail message
	 * @param cause   the underlying failure
	 */
	public BMUSEWebappException(final String message, final Throwable cause) {
		super(message, cause);
	}

	/**
	 * Full constructor, exposing the suppression and stack-trace switches of {@link Exception}.
	 *
	 * @param message            the detail message
	 * @param cause              the underlying failure
	 * @param enableSuppression  whether suppression is enabled
	 * @param writableStackTrace whether the stack trace is writable
	 */
	public BMUSEWebappException(
		final String message,
		final Throwable cause,
		final boolean enableSuppression,
		final boolean writableStackTrace) {
		super(message, cause, enableSuppression, writableStackTrace);
	}
}
|
|
@ -0,0 +1,17 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp.publisher;
|
||||||
|
|
||||||
|
import eu.dnetlib.common.controller.AbstractDnetController;
|
||||||
|
import org.springframework.stereotype.Controller;
|
||||||
|
import org.springframework.web.bind.annotation.GetMapping;
|
||||||
|
|
||||||
|
@Controller
|
||||||
|
public class HomeController extends AbstractDnetController {
|
||||||
|
|
||||||
|
@GetMapping({
|
||||||
|
"/doc", "/swagger"
|
||||||
|
})
|
||||||
|
public String apiDoc() {
|
||||||
|
return "redirect:swagger-ui/";
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,90 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.bmuse_webapp.scraper;
|
||||||
|
|
||||||
|
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
|
||||||
|
import hwu.elixir.scrape.exceptions.MissingMarkupException;
|
||||||
|
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
||||||
|
import org.apache.any23.Any23;
|
||||||
|
import org.apache.any23.extractor.ExtractionException;
|
||||||
|
import org.apache.any23.source.DocumentSource;
|
||||||
|
import org.apache.any23.source.StringDocumentSource;
|
||||||
|
import org.apache.any23.writer.NTriplesWriter;
|
||||||
|
import org.apache.any23.writer.TripleHandler;
|
||||||
|
import org.apache.any23.writer.TripleHandlerException;
|
||||||
|
import org.apache.commons.io.output.ByteArrayOutputStream;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.eclipse.rdf4j.model.IRI;
|
||||||
|
import org.eclipse.rdf4j.model.Model;
|
||||||
|
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
|
||||||
|
import org.eclipse.rdf4j.rio.RDFFormat;
|
||||||
|
import org.eclipse.rdf4j.rio.Rio;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
|
||||||
|
public class BMUSEScraper extends ScraperFilteredCore {
|
||||||
|
|
||||||
|
private static final Log logger = LogFactory.getLog(BMUSEScraper.class);
|
||||||
|
|
||||||
|
public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
|
||||||
|
logger.debug(url + " > scraping");
|
||||||
|
url = fixURL(url);
|
||||||
|
|
||||||
|
String html = "";
|
||||||
|
// The dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information
|
||||||
|
// (dynamic and static respectively)
|
||||||
|
|
||||||
|
if (dynamic) {
|
||||||
|
html = wrapHTMLExtraction(url);
|
||||||
|
} else {
|
||||||
|
html = wrapHTMLExtractionStatic(url);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (html == null || html.contentEquals(""))
|
||||||
|
throw new Exception("empty html");
|
||||||
|
|
||||||
|
html = injectId(html, url);
|
||||||
|
|
||||||
|
logger.debug(url + " > html scraped from " + url);
|
||||||
|
DocumentSource source = new StringDocumentSource(html, url);
|
||||||
|
String n3 = html2Triples(source, url);
|
||||||
|
if (n3 == null) {
|
||||||
|
throw new MissingMarkupException(url);
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.debug(url + " > processing triples");
|
||||||
|
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
|
||||||
|
Model updatedModel = updatedModel = processTriples(n3, sourceIRI, 0l);
|
||||||
|
if (updatedModel == null) {
|
||||||
|
throw new Exception("rdf model null");
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.debug(url + " > generating nquads");
|
||||||
|
try (StringWriter jsonLDWriter = new StringWriter()) {
|
||||||
|
Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS);
|
||||||
|
logger.debug(url + " > nquads generated");
|
||||||
|
return jsonLDWriter.toString();
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String html2Triples(DocumentSource source, String url) throws Exception {
|
||||||
|
Any23 runner = new Any23();
|
||||||
|
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||||
|
TripleHandler handler = new NTriplesWriter(out);) {
|
||||||
|
runner.extract(source, handler);
|
||||||
|
return out.toString("UTF-8");
|
||||||
|
} catch (ExtractionException e) {
|
||||||
|
logger.error("Cannot extract triples", e);
|
||||||
|
} catch (IOException e1) {
|
||||||
|
logger.error(" IO error whilst extracting triples", e1);
|
||||||
|
} catch (TripleHandlerException e2) {
|
||||||
|
logger.error("TripleHanderException", e2);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,157 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp.scraper;
|
||||||
|
|
||||||
|
import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
|
||||||
|
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
|
/**
|
||||||
|
|
||||||
|
*/
|
||||||
|
public class ScrapeState {
|
||||||
|
|
||||||
|
private List<CrawlRecord> urlsToScrape = Collections.synchronizedList(new ArrayList<CrawlRecord>());
|
||||||
|
private List<CrawlRecord> urlsProcessed = Collections.synchronizedList(new ArrayList<CrawlRecord>()); // should this be a set?
|
||||||
|
private Map<String, Object> nquadsConcurrentHashMap = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param pagesToBeScraped The list of sites to be scraped
|
||||||
|
* @see ScrapeThread
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public ScrapeState(List<CrawlRecord> pagesToBeScraped) {
|
||||||
|
urlsToScrape.addAll(pagesToBeScraped);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Any pages/URLs left to scrape?
|
||||||
|
* @return True for yes & false for no
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public synchronized boolean pagesLeftToScrape() {
|
||||||
|
return !urlsToScrape.isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the next URL/CrawlRecord to be scraped
|
||||||
|
*
|
||||||
|
* @return First page/URL that needs to be scraped next
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public synchronized CrawlRecord getURLToProcess() {
|
||||||
|
if (urlsToScrape.isEmpty())
|
||||||
|
return null;
|
||||||
|
|
||||||
|
return urlsToScrape.remove(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds the given CrawlRecord to the list of CrawlRecords successfully scraped.
|
||||||
|
* Updates the status of the CrawlRecord to SUCCESS.
|
||||||
|
*
|
||||||
|
* @param url The latest URL/page that has been successfully scraped
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public synchronized void addSuccessfulScrapedURL(CrawlRecord record) {
|
||||||
|
record.setStatus(StatusOfScrape.SUCCESS);
|
||||||
|
urlsProcessed.add(record);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds the given CrawlRecord to the list of CrawlRecords NOT successfully scraped.
|
||||||
|
* Updates the status of the CrawlRecord; if first failure the status is FAILED.
|
||||||
|
* If status is already FAILED it is changed to GIVEN_UP.
|
||||||
|
*
|
||||||
|
* If the status is FAILED, another try will be made in a future run.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* @param url The latest URL/page that has been unsuccessfully scraped
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public synchronized void addFailedToScrapeURL(CrawlRecord record) {
|
||||||
|
if (record.getStatus().equals(StatusOfScrape.FAILED)) {
|
||||||
|
record.setStatus(StatusOfScrape.GIVEN_UP);
|
||||||
|
} else {
|
||||||
|
record.setStatus(StatusOfScrape.FAILED);
|
||||||
|
}
|
||||||
|
urlsProcessed.add(record);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Changes the status of the CrawlRecord to DOES_NOT_EXIST.
|
||||||
|
* As Selenium does not return the HTTP codes, it is questionable
|
||||||
|
* how useful this is.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* @param url The latest URL/page that has been 404'd
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public synchronized void setStatusTo404(CrawlRecord record) {
|
||||||
|
record.setStatus(StatusOfScrape.DOES_NOT_EXIST);
|
||||||
|
urlsProcessed.add(record);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* Changes the status of the CrawlRecord to HUMAN_INSPECTION.
|
||||||
|
* This captures the idea that the URLs may contain unexpected markup that needs a human to
|
||||||
|
* review and possibly update the scraper.
|
||||||
|
*
|
||||||
|
* @param url The latest URL/page that needs human inspection
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public synchronized void setStatusToHumanInspection(CrawlRecord record) {
|
||||||
|
record.setStatus(StatusOfScrape.HUMAN_INSPECTION);
|
||||||
|
urlsProcessed.add(record);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of URLs that are still to be scraped in this cycle.
|
||||||
|
* This does not return the number of URLs left to scrape in the DBMS, just in the current cycle.
|
||||||
|
*
|
||||||
|
* @return Number of URLs left to scrape in this cycle
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public synchronized int getNumberPagesLeftToScrape() {
|
||||||
|
return urlsToScrape.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the full list of URLs that have been processed in this cycle.
|
||||||
|
* This does not return the number of URLs that have been scraped in total across all cycles.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public synchronized List<CrawlRecord> getPagesProcessed() {
|
||||||
|
return urlsProcessed;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the full list of URLs/CrawlRecords regardless of whether scraped or not in the current cycle.
|
||||||
|
*
|
||||||
|
* @return List of all CrawlRecords in this cycle.
|
||||||
|
* @see CrawlRecord
|
||||||
|
*/
|
||||||
|
public synchronized List<CrawlRecord> getPagesProcessedAndUnprocessed() {
|
||||||
|
List<CrawlRecord> urlsCombined = Collections.synchronizedList(new ArrayList<CrawlRecord>());
|
||||||
|
urlsCombined.addAll(urlsProcessed);
|
||||||
|
urlsCombined.addAll(urlsToScrape);
|
||||||
|
return urlsCombined;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addNquads(String key, String nquads) {
|
||||||
|
nquadsConcurrentHashMap.putIfAbsent(key, nquads);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, Object> getNquadsConcurrentHashMap() {
|
||||||
|
return nquadsConcurrentHashMap;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,109 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp.scraper;
|
||||||
|
|
||||||
|
import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
|
||||||
|
import eu.dnetlib.bmuse_webapp.utils.CompressorUtil;
|
||||||
|
import hwu.elixir.scrape.exceptions.CannotWriteException;
|
||||||
|
import hwu.elixir.scrape.exceptions.FourZeroFourException;
|
||||||
|
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
|
||||||
|
import hwu.elixir.scrape.exceptions.MissingMarkupException;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
//import org.apache.commons.logging.Log;
|
||||||
|
//import org.apache.commons.logging.LogFactory;
|
||||||
|
//import org.slf4j.Logger;
|
||||||
|
//import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Date;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @see BMUSEScraper
|
||||||
|
* @see ScrapeState
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ScrapeThread extends Thread {
|
||||||
|
private ScrapeState scrapeState;
|
||||||
|
private BMUSEScraper process;
|
||||||
|
private int waitTime;
|
||||||
|
private boolean fileWritten = true;
|
||||||
|
private int scrapeVersion = 1;
|
||||||
|
|
||||||
|
private static final Log logger = LogFactory.getLog(ScrapeThread.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets up a thread for actually scrapping.
|
||||||
|
*
|
||||||
|
* @param scraper Scraper that will actually do the scraping.
|
||||||
|
* @param scrapeState Object that maintains state across threads.
|
||||||
|
* @param waitTime How long (in seconds) thread should wait after scraping
|
||||||
|
* page before attempting new page.
|
||||||
|
* @param contextVersion The context URL used is 'https://bioschemas.org/crawl/CONTEXTVERSION/ID' Where ID is the id of the CrawlRecord pulled.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public ScrapeThread(BMUSEScraper scraper, ScrapeState scrapeState, int waitTime, int contextVersion) {
|
||||||
|
this.scrapeState = scrapeState;
|
||||||
|
process = scraper;
|
||||||
|
this.waitTime = waitTime;
|
||||||
|
this.scrapeVersion = contextVersion;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
/**
|
||||||
|
* Defines high-level process of scraping. Actual scraping done by an
|
||||||
|
* implementation of Scraper. If page scrape successful will add url to
|
||||||
|
* Scrape.sitesScraped
|
||||||
|
*
|
||||||
|
* @see Scraper
|
||||||
|
* @see SimpleScraper
|
||||||
|
*/
|
||||||
|
public void run() {
|
||||||
|
while (scrapeState.pagesLeftToScrape()) {
|
||||||
|
CrawlRecord record = scrapeState.getURLToProcess();
|
||||||
|
|
||||||
|
if (record == null)
|
||||||
|
break;
|
||||||
|
|
||||||
|
record.setContext("https://bioschemas.org/crawl/" + scrapeVersion +"/" + record.getId());
|
||||||
|
record.setDateScraped(new Date());
|
||||||
|
|
||||||
|
try {
|
||||||
|
String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
|
||||||
|
// scrapeState.addNquads(record.getName(), nquads);
|
||||||
|
logger.info("downloaded "+record.getUrl());
|
||||||
|
record.setNquads(CompressorUtil.compressValue(nquads));
|
||||||
|
if (!nquads.isEmpty()) {
|
||||||
|
scrapeState.addSuccessfulScrapedURL(record);
|
||||||
|
} else {
|
||||||
|
scrapeState.addFailedToScrapeURL(record);
|
||||||
|
}
|
||||||
|
} catch(FourZeroFourException fourZeroFourException) {
|
||||||
|
scrapeState.setStatusTo404(record);
|
||||||
|
fileWritten = false;
|
||||||
|
} catch (JsonLDInspectionException je) {
|
||||||
|
scrapeState.setStatusToHumanInspection(record);
|
||||||
|
fileWritten = false;
|
||||||
|
} catch (CannotWriteException cannotWrite) {
|
||||||
|
logger.error("Caught cannot read file, setting worked to false!");
|
||||||
|
fileWritten = false;
|
||||||
|
scrapeState.addFailedToScrapeURL(record);
|
||||||
|
return; // no point in continuing
|
||||||
|
} catch (MissingMarkupException e) {
|
||||||
|
logger.error("Cannot obtain markup from " + record.getUrl() +".");
|
||||||
|
fileWritten = false;
|
||||||
|
scrapeState.addFailedToScrapeURL(record);
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
ScrapeThread.sleep(100 * waitTime);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
process.shutdown();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isFileWritten() {
|
||||||
|
return fileWritten;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,72 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp.scraper;
|
||||||
|
|
||||||
|
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
|
||||||
|
import hwu.elixir.scrape.exceptions.*;
|
||||||
|
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
||||||
|
import org.apache.commons.lang.time.DateUtils;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides the
|
||||||
|
* actual scraping functionality.
|
||||||
|
*
|
||||||
|
* Scrapes a given URL, converts into NQuads and writes to a file (name derived
|
||||||
|
* from URL). If the file already exists it will be overwritten.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* @see ScraperFilteredCore
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ServiceScraper extends ScraperFilteredCore {
|
||||||
|
|
||||||
|
private static Logger logger = LoggerFactory.getLogger(System.class.getName());
|
||||||
|
|
||||||
|
private StatusOfScrape status= null;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Orchestrates the process of scraping a site before converting the extracted
|
||||||
|
* triples to NQuads and writing to a file.
|
||||||
|
*
|
||||||
|
* @param url Site to be scraped
|
||||||
|
* @param contextCounter Number used to generate the named graph/context and
|
||||||
|
* the URLs used to replace blank nodes.
|
||||||
|
* @param outputFolderName Location to which the NQuads will be written
|
||||||
|
* @return True if success; false otherwise
|
||||||
|
* @throws FourZeroFourException
|
||||||
|
* @throws JsonLDInspectionException
|
||||||
|
* @throws CannotWriteException
|
||||||
|
* @throws MissingMarkupException
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public boolean scrape(String url, Long contextCounter, String outputFolderName, String fileName, StatusOfScrape status) throws FourZeroFourException, JsonLDInspectionException, CannotWriteException, MissingMarkupException {
|
||||||
|
this.status = status;
|
||||||
|
logger.info("scraping "+url + " to "+fileName);
|
||||||
|
return scrape(url, outputFolderName, fileName, contextCounter, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
/* Now takes account of StateOfCrawl
|
||||||
|
*/
|
||||||
|
protected String wrapHTMLExtraction(String url) throws FourZeroFourException {
|
||||||
|
String html = "";
|
||||||
|
if (status.equals(StatusOfScrape.UNTRIED) || status.equals(StatusOfScrape.FAILED)) {
|
||||||
|
try {
|
||||||
|
html = getHtmlViaSelenium(url);
|
||||||
|
} catch (SeleniumException e) {
|
||||||
|
// try again
|
||||||
|
try {
|
||||||
|
html = getHtmlViaSelenium(url);
|
||||||
|
} catch (SeleniumException e2) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
return html;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,34 @@
|
||||||
|
package eu.dnetlib.bmuse_webapp.utils;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.binary.Base64;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
import java.util.zip.GZIPOutputStream;
|
||||||
|
|
||||||
|
public class CompressorUtil {
|
||||||
|
|
||||||
|
public static String decompressValue(final String abstractCompressed) {
|
||||||
|
try {
|
||||||
|
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
|
||||||
|
GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
|
||||||
|
final StringWriter stringWriter = new StringWriter();
|
||||||
|
IOUtils.copy(gis, stringWriter);
|
||||||
|
return stringWriter.toString();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new IllegalArgumentException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String compressValue(final String value) throws IOException {
|
||||||
|
ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||||
|
GZIPOutputStream gzip = new GZIPOutputStream(out);
|
||||||
|
gzip.write(value.getBytes());
|
||||||
|
gzip.close();
|
||||||
|
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,64 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.bmuse_webapp.utils;
|
||||||
|
|
||||||
|
import hwu.elixir.utils.Helpers;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class UrlParser {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());
|
||||||
|
|
||||||
|
public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
|
||||||
|
|
||||||
|
Document doc = new Document(url);
|
||||||
|
Document urlSitemapListsNested;
|
||||||
|
Elements elements = new Elements();
|
||||||
|
Elements sitemaps = new Elements();
|
||||||
|
boolean sitemapindex = false;
|
||||||
|
boolean urlset = false;
|
||||||
|
|
||||||
|
try {
|
||||||
|
int urlLength = url.length();
|
||||||
|
logger.info("parse sitemap list");
|
||||||
|
String sitemapExt = url.substring(urlLength - 3, urlLength);
|
||||||
|
if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
|
||||||
|
logger.info("compressed sitemap");
|
||||||
|
byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
|
||||||
|
doc = Helpers.gzipFileDecompression(bytes);
|
||||||
|
} else {
|
||||||
|
doc = Jsoup.connect(url).maxBodySize(0).get();
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.error("Jsoup parsing exception: " + e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
elements = doc.select(sitemapURLKey);
|
||||||
|
|
||||||
|
// check the html if it is a sitemapindex or a urlset
|
||||||
|
sitemapindex = doc.outerHtml().contains("sitemapindex");
|
||||||
|
urlset = doc.outerHtml().contains("urlset");
|
||||||
|
} catch (NullPointerException e) {
|
||||||
|
logger.error(e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sitemapindex) {
|
||||||
|
// if sitemapindex get the loc of all the sitemaps
|
||||||
|
// added warning for sitemap index files
|
||||||
|
logger
|
||||||
|
.warn(
|
||||||
|
"please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
|
||||||
|
sitemaps = doc.select(sitemapURLKey);
|
||||||
|
}
|
||||||
|
|
||||||
|
return elements;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,24 @@
|
||||||
|
# HTTP endpoint of the application
server.servlet.context-path=/dnet-bmuse-webapp
server.port=8281

spring.profiles.active=garr

# Same file as the one configured in the logback configuration of this application
# (the previous value pointed to the log of another application).
logging.file.name = /var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log

maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml

spring.main.banner-mode = off

logging.level.root = INFO

# Actuator: expose only prometheus and health, as /metrics and /health
management.endpoints.web.exposure.include = prometheus,health
management.endpoints.web.base-path = /
management.endpoints.web.path-mapping.prometheus = metrics
management.endpoints.web.path-mapping.health = health

# Scraper settings
waitTime=5
# NOTE(review): developer-machine path, override it per environment
outputFolder=/Users/enrico.ottonello/data/bmuse-output
numberOfPagesToCrawlInALoop=8
totalNumberOfPagesToCrawlInASession=32
chromiumDriverLocation = /usr/local/bin/chromedriver
scrapeVersion=1
|
|
@ -0,0 +1,30 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
	<!-- Single rolling file appender shared by all loggers: rolls by day and by size (10MB),
	     keeps 10 periods of history and at most 100MB of archived logs. -->
	<appender name="SAVE-TO-FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
		<file>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log</file>
		<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
			<Pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M - %msg%n</Pattern>
		</encoder>
		<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
			<fileNamePattern>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp_%d{dd-MM-yyyy}_%i.log</fileNamePattern>
			<maxFileSize>10MB</maxFileSize>
			<maxHistory>10</maxHistory>
			<totalSizeCap>100MB</totalSizeCap>
		</rollingPolicy>
	</appender>

	<!-- The named loggers below are not additive, so their events are written once
	     and are not forwarded to the root logger as well. -->
	<logger name="org.springframework" level="INFO" additivity="false">
		<appender-ref ref="SAVE-TO-FILE" />
	</logger>
	<logger name="eu.dnetlib" level="INFO" additivity="false">
		<appender-ref ref="SAVE-TO-FILE" />
	</logger>
	<logger name="eu.dnetlib.bmuse_webapp" level="INFO" additivity="false">
		<appender-ref ref="SAVE-TO-FILE" />
	</logger>
	<logger name="hwu.elixir" level="INFO" additivity="false">
		<appender-ref ref="SAVE-TO-FILE" />
	</logger>

	<!-- Everything else goes through the root logger, configured with its dedicated element. -->
	<root level="INFO">
		<appender-ref ref="SAVE-TO-FILE" />
	</root>
</configuration>
|
|
@ -18,6 +18,7 @@
|
||||||
<module>dnet-orgs-database-application</module>
|
<module>dnet-orgs-database-application</module>
|
||||||
<module>dnet-exporter-api</module>
|
<module>dnet-exporter-api</module>
|
||||||
<module>scholexplorer-api</module>
|
<module>scholexplorer-api</module>
|
||||||
|
<module>bioschemas-api</module>
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
|
12
pom.xml
12
pom.xml
|
@ -88,6 +88,18 @@
|
||||||
<name>Cloudera Repository</name>
|
<name>Cloudera Repository</name>
|
||||||
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
|
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
|
||||||
</repository>
|
</repository>
|
||||||
|
<repository>
|
||||||
|
<id>dnet-deps</id>
|
||||||
|
<name>D-Net Dependencies</name>
|
||||||
|
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
|
||||||
|
<releases>
|
||||||
|
<enabled>true</enabled>
|
||||||
|
</releases>
|
||||||
|
<snapshots>
|
||||||
|
<enabled>false</enabled>
|
||||||
|
</snapshots>
|
||||||
|
<layout>default</layout>
|
||||||
|
</repository>
|
||||||
</repositories>
|
</repositories>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
|
Loading…
Reference in New Issue