fixed dependencies declaration, configuration properties
This commit is contained in:
parent
7375534764
commit
079b2506e6
|
@ -12,62 +12,38 @@
|
||||||
<artifactId>bioschemas-api</artifactId>
|
<artifactId>bioschemas-api</artifactId>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
|
||||||
<groupId>org.springframework.boot</groupId>
|
|
||||||
<artifactId>spring-boot-starter-test</artifactId>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>hwu.elixir</groupId>
|
<groupId>hwu.elixir</groupId>
|
||||||
<artifactId>bmuse-core</artifactId>
|
<artifactId>bmuse-core</artifactId>
|
||||||
<version>0.5.4</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>org.freemarker</groupId>
|
|
||||||
<artifactId>freemarker</artifactId>
|
|
||||||
<version>2.3.27-incubating</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.any23</groupId>
|
<groupId>org.apache.any23</groupId>
|
||||||
<artifactId>apache-any23-core</artifactId>
|
<artifactId>apache-any23-core</artifactId>
|
||||||
<version>2.3</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.eclipse.rdf4j</groupId>
|
<groupId>org.eclipse.rdf4j</groupId>
|
||||||
<artifactId>rdf4j-rio-rdfxml</artifactId>
|
<artifactId>rdf4j-rio-rdfxml</artifactId>
|
||||||
<version>3.7.1</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.eclipse.rdf4j</groupId>
|
<groupId>org.eclipse.rdf4j</groupId>
|
||||||
<artifactId>rdf4j-model</artifactId>
|
<artifactId>rdf4j-model</artifactId>
|
||||||
<version>3.7.1</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<!-- rdf 2.5.4 to 3.7.1-->
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.jsoup</groupId>
|
<groupId>org.jsoup</groupId>
|
||||||
<artifactId>jsoup</artifactId>
|
<artifactId>jsoup</artifactId>
|
||||||
<version>1.13.1</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.seleniumhq.selenium</groupId>
|
<groupId>org.seleniumhq.selenium</groupId>
|
||||||
<artifactId>selenium-java</artifactId>
|
<artifactId>selenium-java</artifactId>
|
||||||
<version>3.141.59</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>commons-io</groupId>
|
<groupId>commons-io</groupId>
|
||||||
<artifactId>commons-io</artifactId>
|
<artifactId>commons-io</artifactId>
|
||||||
<version>2.6</version>
|
<version>${bioschemas-commons-io-version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>commons-validator</groupId>
|
<groupId>commons-validator</groupId>
|
||||||
<artifactId>commons-validator</artifactId>
|
<artifactId>commons-validator</artifactId>
|
||||||
<version>1.6</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>ch.qos.logback</groupId>
|
|
||||||
<artifactId>logback-classic</artifactId>
|
|
||||||
<version>1.2.3</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
<build>
|
<build>
|
||||||
|
|
|
@ -1,173 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
|
||||||
<parent>
|
|
||||||
<groupId>org.springframework.boot</groupId>
|
|
||||||
<artifactId>spring-boot-starter-parent</artifactId>
|
|
||||||
<version>2.1.3.RELEASE</version>
|
|
||||||
</parent>
|
|
||||||
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
<groupId>eu.dnetlib</groupId>
|
|
||||||
<artifactId>dnet-bmuse-webapp</artifactId>
|
|
||||||
<packaging>jar</packaging>
|
|
||||||
<version>1.0.0-SNAPSHOT</version>
|
|
||||||
<scm>
|
|
||||||
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk</developerConnection>
|
|
||||||
<url>https://github.com/spring-projects/spring-boot/spring-boot-starter-parent/dnet-bmuse-webapp</url>
|
|
||||||
</scm>
|
|
||||||
<ciManagement>
|
|
||||||
<system>jenkins</system>
|
|
||||||
<url>https://jenkins-dnet.d4science.org/view/DNet50/job/DSL50_dnet-bmuse-webapp/</url>
|
|
||||||
</ciManagement>
|
|
||||||
<distributionManagement>
|
|
||||||
<repository>
|
|
||||||
<id>dnet5-releases</id>
|
|
||||||
<name>D-Net 5 Releases</name>
|
|
||||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
|
|
||||||
<layout>default</layout>
|
|
||||||
</repository>
|
|
||||||
</distributionManagement>
|
|
||||||
|
|
||||||
<!-- Inherit defaults from Spring Boot -->
|
|
||||||
|
|
||||||
<repositories>
|
|
||||||
<repository>
|
|
||||||
<id>dnet-deps</id>
|
|
||||||
<name>D-Net Dependencies</name>
|
|
||||||
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
|
|
||||||
<releases>
|
|
||||||
<enabled>true</enabled>
|
|
||||||
</releases>
|
|
||||||
<snapshots>
|
|
||||||
<enabled>false</enabled>
|
|
||||||
</snapshots>
|
|
||||||
<layout>default</layout>
|
|
||||||
</repository>
|
|
||||||
<repository>
|
|
||||||
<id>dnet5-releases</id>
|
|
||||||
<name>D-Net 5 Releases</name>
|
|
||||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
|
|
||||||
<layout>default</layout>
|
|
||||||
<snapshots>
|
|
||||||
<enabled>false</enabled>
|
|
||||||
</snapshots>
|
|
||||||
</repository>
|
|
||||||
<repository>
|
|
||||||
<id>dnet5-snapshots</id>
|
|
||||||
<name>D-Net 5 Snapshots</name>
|
|
||||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots</url>
|
|
||||||
<layout>default</layout>
|
|
||||||
<snapshots>
|
|
||||||
<enabled>true</enabled>
|
|
||||||
</snapshots>
|
|
||||||
</repository>
|
|
||||||
</repositories>
|
|
||||||
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
<version>4.13-rc-1</version>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.springframework.boot</groupId>
|
|
||||||
<artifactId>spring-boot-starter-test</artifactId>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.springframework.boot</groupId>
|
|
||||||
<artifactId>spring-boot-autoconfigure</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.springframework.boot</groupId>
|
|
||||||
<artifactId>spring-boot</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.springframework.boot</groupId>
|
|
||||||
<artifactId>spring-boot-starter-web</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>hwu.elixir</groupId>
|
|
||||||
<artifactId>bmuse-core</artifactId>
|
|
||||||
<version>0.5.4</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.freemarker</groupId>
|
|
||||||
<artifactId>freemarker</artifactId>
|
|
||||||
<version>2.3.27-incubating</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.any23</groupId>
|
|
||||||
<artifactId>apache-any23-core</artifactId>
|
|
||||||
<version>2.3</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.eclipse.rdf4j</groupId>
|
|
||||||
<artifactId>rdf4j-rio-rdfxml</artifactId>
|
|
||||||
<version>3.7.1</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.eclipse.rdf4j</groupId>
|
|
||||||
<artifactId>rdf4j-model</artifactId>
|
|
||||||
<version>3.7.1</version>
|
|
||||||
</dependency>
|
|
||||||
<!-- rdf 2.5.4 to 3.7.1-->
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.jsoup</groupId>
|
|
||||||
<artifactId>jsoup</artifactId>
|
|
||||||
<version>1.13.1</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.seleniumhq.selenium</groupId>
|
|
||||||
<artifactId>selenium-java</artifactId>
|
|
||||||
<version>3.141.59</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>commons-io</groupId>
|
|
||||||
<artifactId>commons-io</artifactId>
|
|
||||||
<version>2.6</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>commons-validator</groupId>
|
|
||||||
<artifactId>commons-validator</artifactId>
|
|
||||||
<version>1.6</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>ch.qos.logback</groupId>
|
|
||||||
<artifactId>logback-classic</artifactId>
|
|
||||||
<version>1.2.3</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
</dependencies>
|
|
||||||
|
|
||||||
<build>
|
|
||||||
<plugins>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.springframework.boot</groupId>
|
|
||||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
|
||||||
<configuration>
|
|
||||||
<executable>true</executable>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
</plugins>
|
|
||||||
</build>
|
|
||||||
|
|
||||||
<properties>
|
|
||||||
<java.version>1.8</java.version>
|
|
||||||
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
|
|
||||||
</properties>
|
|
||||||
|
|
||||||
<profiles>
|
|
||||||
<profile>
|
|
||||||
<id>java8-doclint-disabled</id>
|
|
||||||
<activation>
|
|
||||||
<jdk>[1.8,)</jdk>
|
|
||||||
</activation>
|
|
||||||
<properties>
|
|
||||||
<javadoc.opts>-Xdoclint:none</javadoc.opts>
|
|
||||||
</properties>
|
|
||||||
</profile>
|
|
||||||
</profiles>
|
|
||||||
</project>
|
|
|
@ -1,13 +1,7 @@
|
||||||
package eu.dnetlib.bmuse_webapp;
|
package eu.dnetlib.bmuse_webapp;
|
||||||
|
|
||||||
import org.springframework.boot.web.client.RestTemplateBuilder;
|
|
||||||
import org.springframework.context.annotation.Bean;
|
|
||||||
import org.springframework.context.annotation.Configuration;
|
import org.springframework.context.annotation.Configuration;
|
||||||
import org.springframework.context.annotation.Profile;
|
import org.springframework.context.annotation.Profile;
|
||||||
import org.springframework.web.client.RestTemplate;
|
|
||||||
|
|
||||||
import freemarker.cache.ClassTemplateLoader;
|
|
||||||
import freemarker.template.TemplateExceptionHandler;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author enrico.ottonello
|
* @author enrico.ottonello
|
||||||
|
@ -17,29 +11,4 @@ import freemarker.template.TemplateExceptionHandler;
|
||||||
@Configuration
|
@Configuration
|
||||||
public class AppConfigGarr {
|
public class AppConfigGarr {
|
||||||
|
|
||||||
@Bean
|
|
||||||
public RestTemplate jrrRestTemplate(){
|
|
||||||
//TODO: move configuration here from CatalogueRegistrator?
|
|
||||||
return new RestTemplateBuilder().build();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Bean
|
|
||||||
public freemarker.template.Configuration freemarkerConfig(){
|
|
||||||
freemarker.template.Configuration config = new freemarker.template.Configuration(freemarker.template.Configuration.VERSION_2_3_27);
|
|
||||||
ClassTemplateLoader ctl = new ClassTemplateLoader(getClass(), "/eu/dnetlib/bmuse_webapp/sparql");
|
|
||||||
config.setTemplateLoader(ctl);
|
|
||||||
config.setDefaultEncoding("UTF-8");
|
|
||||||
// Sets how errors will appear.
|
|
||||||
// During web page *development* TemplateExceptionHandler.HTML_DEBUG_HANDLER is better.
|
|
||||||
config.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
|
|
||||||
|
|
||||||
// Don't log exceptions inside FreeMarker that it will thrown at you anyway:
|
|
||||||
config.setLogTemplateExceptions(false);
|
|
||||||
|
|
||||||
// Wrap unchecked exceptions thrown during template processing into TemplateException-s.
|
|
||||||
config.setWrapUncheckedExceptions(true);
|
|
||||||
|
|
||||||
return config;
|
|
||||||
}
|
|
||||||
}
|
}
|
|
@ -4,31 +4,24 @@ import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
|
||||||
import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
|
import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
|
||||||
import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
|
import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
|
||||||
import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;
|
import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;
|
||||||
import eu.dnetlib.bmuse_webapp.scraper.ServiceScraper;
|
|
||||||
import eu.dnetlib.bmuse_webapp.utils.UrlParser;
|
import eu.dnetlib.bmuse_webapp.utils.UrlParser;
|
||||||
import hwu.elixir.utils.Helpers;
|
|
||||||
import org.apache.commons.io.FileUtils;
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
import org.jsoup.select.Elements;
|
import org.jsoup.select.Elements;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.*;
|
import java.util.Date;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Properties;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Runs the scrape. Collect a list of URLs (in the form of CrawlRecords) to scrape.
|
* Runs the scrape. Collect a list of URLs (in the form of CrawlRecords) to scrape.
|
||||||
* Scrapes them in turn, writes the (bio)schema markup extracted to a file (1 file per URL)
|
|
||||||
* and adds provenance to the CrawlRecord.
|
|
||||||
*
|
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class ServiceScrapeDriver {
|
public class ServiceScrapeDriver {
|
||||||
|
@ -96,31 +89,15 @@ public class ServiceScrapeDriver {
|
||||||
|
|
||||||
logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
|
logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
|
||||||
while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
|
while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
|
||||||
logger.info(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
|
logger.debug(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
|
||||||
|
|
||||||
ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
|
ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
|
||||||
scrape1.setName("S1");
|
scrape1.setName("S1");
|
||||||
|
|
||||||
// ScrapeThread scrape2 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
|
|
||||||
// scrape2.setName("S2");
|
|
||||||
//
|
|
||||||
// ScrapeThread scrape3 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
|
|
||||||
// scrape3.setName("S3");
|
|
||||||
//
|
|
||||||
// ScrapeThread scrape4 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
|
|
||||||
// scrape4.setName("S4");
|
|
||||||
|
|
||||||
scrape1.start();
|
scrape1.start();
|
||||||
// scrape2.start();
|
|
||||||
// scrape3.start();
|
|
||||||
// scrape4.start();
|
|
||||||
long startTime = System.nanoTime();
|
long startTime = System.nanoTime();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
scrape1.join();
|
scrape1.join();
|
||||||
// scrape2.join();
|
|
||||||
// scrape3.join();
|
|
||||||
// scrape4.join();
|
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
logger.error("Exception waiting on thread");
|
logger.error("Exception waiting on thread");
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
@ -135,21 +112,13 @@ public class ServiceScrapeDriver {
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
|
logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
|
||||||
|
|
||||||
long endTime = System.nanoTime();
|
long endTime = System.nanoTime();
|
||||||
long timeElapsed = endTime - startTime;
|
long timeElapsed = endTime - startTime;
|
||||||
logger.info("Time in s to complete: " + timeElapsed / 1e+9);
|
logger.debug("Time in s to complete: " + timeElapsed / 1e+9);
|
||||||
|
|
||||||
updateDatabase(scrapeState);
|
|
||||||
pagesCounter += numberOfPagesToCrawlInALoop;
|
pagesCounter += numberOfPagesToCrawlInALoop;
|
||||||
|
logger.debug("ENDED loop");
|
||||||
|
|
||||||
logger.info("ENDED loop");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Map<String, Object> nquads = scrapeState.getNquadsConcurrentHashMap();
|
|
||||||
// logger.info("Available nquads records: "+nquads.size() );
|
|
||||||
|
|
||||||
logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
|
logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
|
||||||
|
|
||||||
File output = new File(outputFolder.concat("/").concat(outputFilename));
|
File output = new File(outputFolder.concat("/").concat(outputFilename));
|
||||||
|
@ -173,20 +142,7 @@ public class ServiceScrapeDriver {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bufferedWriter.close();
|
bufferedWriter.close();
|
||||||
logger.info(" dump to "+output.getAbsolutePath());
|
logger.info(" Data stored into "+output.getAbsolutePath());
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @param scrapeState State of scrape at end
|
|
||||||
* @return true if success / false otherwise
|
|
||||||
* @see ScrapeState
|
|
||||||
* @see CrawlRecord
|
|
||||||
*/
|
|
||||||
private boolean updateDatabase(ScrapeState scrapeState) {
|
|
||||||
boolean result = false;
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -256,6 +212,4 @@ public class ServiceScrapeDriver {
|
||||||
String[] parts = pageUrl.split("/");
|
String[] parts = pageUrl.split("/");
|
||||||
return parts[parts.length - 1];
|
return parts[parts.length - 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,21 +3,19 @@ package eu.dnetlib.bmuse_webapp.publisher;
|
||||||
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
|
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
|
||||||
import eu.dnetlib.common.controller.AbstractDnetController;
|
import eu.dnetlib.common.controller.AbstractDnetController;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.io.LineIterator;
|
import org.apache.commons.io.LineIterator;
|
||||||
import org.apache.commons.logging.Log;
|
import org.slf4j.Logger;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.apache.tomcat.jni.FileInfo;
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
import org.springframework.web.bind.annotation.*;
|
import org.springframework.web.bind.annotation.RequestMethod;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
|
||||||
import javax.servlet.http.HttpServletResponse;
|
import javax.servlet.http.HttpServletResponse;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.FileReader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author enrico.ottonello
|
* @author enrico.ottonello
|
||||||
|
@ -28,30 +26,31 @@ import java.util.List;
|
||||||
@RequestMapping("/api")
|
@RequestMapping("/api")
|
||||||
public class BMUSEWebappController extends AbstractDnetController {
|
public class BMUSEWebappController extends AbstractDnetController {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(BMUSEWebappController.class);
|
@Value("${outputFolder}")
|
||||||
|
private String outputFolder;
|
||||||
|
@Value("${outputDataPattern}")
|
||||||
|
private String outputDataPattern;
|
||||||
|
|
||||||
@RequestMapping(value = "/version", method = RequestMethod.GET)
|
private static Logger logger = LoggerFactory.getLogger(BMUSEWebappController.class);
|
||||||
public String version() throws BMUSEWebappException {
|
|
||||||
return "1.0.0-SNAPSHOT";
|
|
||||||
}
|
|
||||||
|
|
||||||
@RequestMapping(value = "/scrape", method = RequestMethod.GET)
|
@RequestMapping(value = "/startScraping", method = RequestMethod.GET)
|
||||||
public String scrape(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
|
public String startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
|
||||||
|
|
||||||
|
logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
|
||||||
|
|
||||||
log.info("datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
|
|
||||||
// String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz"; scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
|
|
||||||
// String sitemapUrl = "https://proteinensemble.org/sitemap2.xml.gz"; scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
|
|
||||||
// String sitemapUrl = "https://disprot.org/sitemap2.xml.gz"; scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
|
|
||||||
String sitemapUrlKey = "loc";
|
String sitemapUrlKey = "loc";
|
||||||
String outputFilename = datasourceKey.concat("_base64_gzipped_nquads.txt");
|
String outputFilename = datasourceKey.concat(getOutputDataPattern());
|
||||||
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
|
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
|
||||||
service.start();
|
service.start();
|
||||||
return "started";
|
return "started";
|
||||||
}
|
}
|
||||||
|
|
||||||
@RequestMapping(value = "/nquads", method = RequestMethod.GET)
|
@RequestMapping(value = "/getNQuads", method = RequestMethod.GET)
|
||||||
public String nquads(HttpServletResponse response) throws BMUSEWebappException, IOException {
|
public String getNQuads(@RequestParam final String datasourceKey, HttpServletResponse response) throws BMUSEWebappException, IOException {
|
||||||
LineIterator it = FileUtils.lineIterator(new File("/Users/enrico.ottonello/data/bmuse-output/output.nq"), "UTF-8");
|
|
||||||
|
logger.info("<GETNQUADS> datasourceKey: "+datasourceKey);
|
||||||
|
|
||||||
|
LineIterator it = FileUtils.lineIterator(new File(getOutputFolder().concat("/").concat(datasourceKey).concat(getOutputDataPattern())), "UTF-8");
|
||||||
try {
|
try {
|
||||||
while (it.hasNext()) {
|
while (it.hasNext()) {
|
||||||
String line = it.nextLine();
|
String line = it.nextLine();
|
||||||
|
@ -62,4 +61,12 @@ public class BMUSEWebappController extends AbstractDnetController {
|
||||||
}
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getOutputFolder() {
|
||||||
|
return outputFolder;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getOutputDataPattern() {
|
||||||
|
return outputDataPattern;
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -1,7 +1,6 @@
|
||||||
|
|
||||||
package eu.dnetlib.bmuse_webapp.scraper;
|
package eu.dnetlib.bmuse_webapp.scraper;
|
||||||
|
|
||||||
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
|
|
||||||
import hwu.elixir.scrape.exceptions.MissingMarkupException;
|
import hwu.elixir.scrape.exceptions.MissingMarkupException;
|
||||||
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
||||||
import org.apache.any23.Any23;
|
import org.apache.any23.Any23;
|
||||||
|
@ -12,8 +11,6 @@ import org.apache.any23.writer.NTriplesWriter;
|
||||||
import org.apache.any23.writer.TripleHandler;
|
import org.apache.any23.writer.TripleHandler;
|
||||||
import org.apache.any23.writer.TripleHandlerException;
|
import org.apache.any23.writer.TripleHandlerException;
|
||||||
import org.apache.commons.io.output.ByteArrayOutputStream;
|
import org.apache.commons.io.output.ByteArrayOutputStream;
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
import org.eclipse.rdf4j.model.IRI;
|
import org.eclipse.rdf4j.model.IRI;
|
||||||
import org.eclipse.rdf4j.model.Model;
|
import org.eclipse.rdf4j.model.Model;
|
||||||
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
|
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
|
||||||
|
@ -27,7 +24,7 @@ import java.io.StringWriter;
|
||||||
|
|
||||||
public class BMUSEScraper extends ScraperFilteredCore {
|
public class BMUSEScraper extends ScraperFilteredCore {
|
||||||
|
|
||||||
private static final Log logger = LogFactory.getLog(BMUSEScraper.class);
|
private static Logger logger = LoggerFactory.getLogger(BMUSEScraper.class);
|
||||||
|
|
||||||
public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
|
public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
|
||||||
logger.debug(url + " > scraping");
|
logger.debug(url + " > scraping");
|
||||||
|
|
|
@ -6,13 +6,8 @@ import hwu.elixir.scrape.exceptions.CannotWriteException;
|
||||||
import hwu.elixir.scrape.exceptions.FourZeroFourException;
|
import hwu.elixir.scrape.exceptions.FourZeroFourException;
|
||||||
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
|
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
|
||||||
import hwu.elixir.scrape.exceptions.MissingMarkupException;
|
import hwu.elixir.scrape.exceptions.MissingMarkupException;
|
||||||
import org.apache.commons.logging.Log;
|
import org.slf4j.Logger;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
//import org.apache.commons.logging.Log;
|
|
||||||
//import org.apache.commons.logging.LogFactory;
|
|
||||||
//import org.slf4j.Logger;
|
|
||||||
//import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -28,7 +23,7 @@ public class ScrapeThread extends Thread {
|
||||||
private boolean fileWritten = true;
|
private boolean fileWritten = true;
|
||||||
private int scrapeVersion = 1;
|
private int scrapeVersion = 1;
|
||||||
|
|
||||||
private static final Log logger = LogFactory.getLog(ScrapeThread.class);
|
private static Logger logger = LoggerFactory.getLogger(ScrapeThread.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets up a thread for actually scrapping.
|
* Sets up a thread for actually scrapping.
|
||||||
|
@ -68,8 +63,7 @@ public class ScrapeThread extends Thread {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
|
String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
|
||||||
// scrapeState.addNquads(record.getName(), nquads);
|
logger.info("downloaded "+record.getUrl() + " leftToScrape:" + scrapeState.pagesLeftToScrape());
|
||||||
logger.info("downloaded "+record.getUrl());
|
|
||||||
record.setNquads(CompressorUtil.compressValue(nquads));
|
record.setNquads(CompressorUtil.compressValue(nquads));
|
||||||
if (!nquads.isEmpty()) {
|
if (!nquads.isEmpty()) {
|
||||||
scrapeState.addSuccessfulScrapedURL(record);
|
scrapeState.addSuccessfulScrapedURL(record);
|
||||||
|
|
|
@ -3,7 +3,6 @@ package eu.dnetlib.bmuse_webapp.scraper;
|
||||||
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
|
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
|
||||||
import hwu.elixir.scrape.exceptions.*;
|
import hwu.elixir.scrape.exceptions.*;
|
||||||
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
||||||
import org.apache.commons.lang.time.DateUtils;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -20,7 +19,7 @@ import org.slf4j.LoggerFactory;
|
||||||
*/
|
*/
|
||||||
public class ServiceScraper extends ScraperFilteredCore {
|
public class ServiceScraper extends ScraperFilteredCore {
|
||||||
|
|
||||||
private static Logger logger = LoggerFactory.getLogger(System.class.getName());
|
private static Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
|
||||||
|
|
||||||
private StatusOfScrape status= null;
|
private StatusOfScrape status= null;
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ server.port=8281
|
||||||
|
|
||||||
spring.profiles.active=garr
|
spring.profiles.active=garr
|
||||||
|
|
||||||
logging.file.name = /var/log/springboot/9480/oa_organizations.log
|
logging.file.name = /var/log/bioschemas/log/bioschemas-api.log
|
||||||
|
|
||||||
maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
|
maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
|
||||||
|
|
||||||
|
@ -17,7 +17,8 @@ management.endpoints.web.path-mapping.prometheus = metrics
|
||||||
management.endpoints.web.path-mapping.health = health
|
management.endpoints.web.path-mapping.health = health
|
||||||
|
|
||||||
waitTime=5
|
waitTime=5
|
||||||
outputFolder=/Users/enrico.ottonello/data/bmuse-output
|
outputFolder=/data
|
||||||
|
outputDataPattern=_base64_gzipped_nquads.txt
|
||||||
numberOfPagesToCrawlInALoop=8
|
numberOfPagesToCrawlInALoop=8
|
||||||
totalNumberOfPagesToCrawlInASession=32
|
totalNumberOfPagesToCrawlInASession=32
|
||||||
chromiumDriverLocation = /usr/local/bin/chromedriver
|
chromiumDriverLocation = /usr/local/bin/chromedriver
|
||||||
|
|
|
@ -1,30 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<configuration>
|
|
||||||
<appender name="SAVE-TO-FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
|
|
||||||
<file>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log</file>
|
|
||||||
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
|
|
||||||
<Pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M - %msg%n</Pattern>
|
|
||||||
</encoder>
|
|
||||||
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
|
|
||||||
<fileNamePattern>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp_%d{dd-MM-yyyy}_%i.log</fileNamePattern>
|
|
||||||
<maxFileSize>10MB</maxFileSize>
|
|
||||||
<maxHistory>10</maxHistory>
|
|
||||||
<totalSizeCap>100MB</totalSizeCap>
|
|
||||||
</rollingPolicy>
|
|
||||||
</appender>
|
|
||||||
<logger name="org.springframework" level="INFO" additivity="false">
|
|
||||||
<appender-ref ref="SAVE-TO-FILE" />
|
|
||||||
</logger>
|
|
||||||
<logger name="root" level="INFO" additivity="false">
|
|
||||||
<appender-ref ref="SAVE-TO-FILE" />
|
|
||||||
</logger>
|
|
||||||
<logger name="eu.dnetlib" level="INFO" additivity="false">
|
|
||||||
<appender-ref ref="SAVE-TO-FILE" />
|
|
||||||
</logger>
|
|
||||||
<logger name="eu.dnetlib.bmuse_webapp" level="INFO" additivity="false">
|
|
||||||
<appender-ref ref="SAVE-TO-FILE" />
|
|
||||||
</logger>
|
|
||||||
<logger name="hwu.elixir" level="INFO" additivity="false">
|
|
||||||
<appender-ref ref="SAVE-TO-FILE" />
|
|
||||||
</logger>
|
|
||||||
</configuration>
|
|
38
pom.xml
38
pom.xml
|
@ -278,6 +278,43 @@
|
||||||
</exclusions>
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<!-- Bioschemas BMUSE -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>hwu.elixir</groupId>
|
||||||
|
<artifactId>bmuse-core</artifactId>
|
||||||
|
<version>0.5.4</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.any23</groupId>
|
||||||
|
<artifactId>apache-any23-core</artifactId>
|
||||||
|
<version>2.3</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.eclipse.rdf4j</groupId>
|
||||||
|
<artifactId>rdf4j-rio-rdfxml</artifactId>
|
||||||
|
<version>3.7.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.eclipse.rdf4j</groupId>
|
||||||
|
<artifactId>rdf4j-model</artifactId>
|
||||||
|
<version>3.7.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jsoup</groupId>
|
||||||
|
<artifactId>jsoup</artifactId>
|
||||||
|
<version>1.13.1</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.seleniumhq.selenium</groupId>
|
||||||
|
<artifactId>selenium-java</artifactId>
|
||||||
|
<version>3.141.59</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>commons-validator</groupId>
|
||||||
|
<artifactId>commons-validator</artifactId>
|
||||||
|
<version>1.6</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</dependencyManagement>
|
</dependencyManagement>
|
||||||
|
|
||||||
|
@ -418,5 +455,6 @@
|
||||||
<javamelody.version>1.71.0</javamelody.version>
|
<javamelody.version>1.71.0</javamelody.version>
|
||||||
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
|
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
|
||||||
<dockerfile-maven-version>1.3.6</dockerfile-maven-version>
|
<dockerfile-maven-version>1.3.6</dockerfile-maven-version>
|
||||||
|
<bioschemas-commons-io-version>2.6</bioschemas-commons-io-version>
|
||||||
</properties>
|
</properties>
|
||||||
</project>
|
</project>
|
||||||
|
|
Loading…
Reference in New Issue