fixed dependencies declaration, configuration properties
This commit is contained in:
parent
7375534764
commit
079b2506e6
|
@ -12,62 +12,38 @@
|
|||
<artifactId>bioschemas-api</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-test</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>hwu.elixir</groupId>
|
||||
<artifactId>bmuse-core</artifactId>
|
||||
<version>0.5.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.freemarker</groupId>
|
||||
<artifactId>freemarker</artifactId>
|
||||
<version>2.3.27-incubating</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.any23</groupId>
|
||||
<artifactId>apache-any23-core</artifactId>
|
||||
<version>2.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.eclipse.rdf4j</groupId>
|
||||
<artifactId>rdf4j-rio-rdfxml</artifactId>
|
||||
<version>3.7.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.eclipse.rdf4j</groupId>
|
||||
<artifactId>rdf4j-model</artifactId>
|
||||
<version>3.7.1</version>
|
||||
</dependency>
|
||||
<!-- rdf4j: 2.5.4 to 3.7.1 -->
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>1.13.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.seleniumhq.selenium</groupId>
|
||||
<artifactId>selenium-java</artifactId>
|
||||
<version>3.141.59</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
<version>2.6</version>
|
||||
<version>${bioschemas-commons-io-version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-validator</groupId>
|
||||
<artifactId>commons-validator</artifactId>
|
||||
<version>1.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>ch.qos.logback</groupId>
|
||||
<artifactId>logback-classic</artifactId>
|
||||
<version>1.2.3</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
|
|
|
@ -1,173 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<parent>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-parent</artifactId>
|
||||
<version>2.1.3.RELEASE</version>
|
||||
</parent>
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-bmuse-webapp</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
<scm>
|
||||
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk</developerConnection>
|
||||
<url>https://github.com/spring-projects/spring-boot/spring-boot-starter-parent/dnet-bmuse-webapp</url>
|
||||
</scm>
|
||||
<ciManagement>
|
||||
<system>jenkins</system>
|
||||
<url>https://jenkins-dnet.d4science.org/view/DNet50/job/DSL50_dnet-bmuse-webapp/</url>
|
||||
</ciManagement>
|
||||
<distributionManagement>
|
||||
<repository>
|
||||
<id>dnet5-releases</id>
|
||||
<name>D-Net 5 Releases</name>
|
||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
|
||||
<layout>default</layout>
|
||||
</repository>
|
||||
</distributionManagement>
|
||||
|
||||
<!-- Inherit defaults from Spring Boot -->
|
||||
|
||||
<repositories>
|
||||
<repository>
|
||||
<id>dnet-deps</id>
|
||||
<name>D-Net Dependencies</name>
|
||||
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
|
||||
<releases>
|
||||
<enabled>true</enabled>
|
||||
</releases>
|
||||
<snapshots>
|
||||
<enabled>false</enabled>
|
||||
</snapshots>
|
||||
<layout>default</layout>
|
||||
</repository>
|
||||
<repository>
|
||||
<id>dnet5-releases</id>
|
||||
<name>D-Net 5 Releases</name>
|
||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
|
||||
<layout>default</layout>
|
||||
<snapshots>
|
||||
<enabled>false</enabled>
|
||||
</snapshots>
|
||||
</repository>
|
||||
<repository>
|
||||
<id>dnet5-snapshots</id>
|
||||
<name>D-Net 5 Snapshots</name>
|
||||
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots</url>
|
||||
<layout>default</layout>
|
||||
<snapshots>
|
||||
<enabled>true</enabled>
|
||||
</snapshots>
|
||||
</repository>
|
||||
</repositories>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.13-rc-1</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-test</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-autoconfigure</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-web</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>hwu.elixir</groupId>
|
||||
<artifactId>bmuse-core</artifactId>
|
||||
<version>0.5.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.freemarker</groupId>
|
||||
<artifactId>freemarker</artifactId>
|
||||
<version>2.3.27-incubating</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.any23</groupId>
|
||||
<artifactId>apache-any23-core</artifactId>
|
||||
<version>2.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.eclipse.rdf4j</groupId>
|
||||
<artifactId>rdf4j-rio-rdfxml</artifactId>
|
||||
<version>3.7.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.eclipse.rdf4j</groupId>
|
||||
<artifactId>rdf4j-model</artifactId>
|
||||
<version>3.7.1</version>
|
||||
</dependency>
|
||||
<!-- rdf4j: 2.5.4 to 3.7.1 -->
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>1.13.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.seleniumhq.selenium</groupId>
|
||||
<artifactId>selenium-java</artifactId>
|
||||
<version>3.141.59</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
<version>2.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-validator</groupId>
|
||||
<artifactId>commons-validator</artifactId>
|
||||
<version>1.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>ch.qos.logback</groupId>
|
||||
<artifactId>logback-classic</artifactId>
|
||||
<version>1.2.3</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<executable>true</executable>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<properties>
|
||||
<java.version>1.8</java.version>
|
||||
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
|
||||
</properties>
|
||||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>java8-doclint-disabled</id>
|
||||
<activation>
|
||||
<jdk>[1.8,)</jdk>
|
||||
</activation>
|
||||
<properties>
|
||||
<javadoc.opts>-Xdoclint:none</javadoc.opts>
|
||||
</properties>
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
|
@ -1,13 +1,7 @@
|
|||
package eu.dnetlib.bmuse_webapp;
|
||||
|
||||
import org.springframework.boot.web.client.RestTemplateBuilder;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Profile;
|
||||
import org.springframework.web.client.RestTemplate;
|
||||
|
||||
import freemarker.cache.ClassTemplateLoader;
|
||||
import freemarker.template.TemplateExceptionHandler;
|
||||
|
||||
/**
|
||||
* @author enrico.ottonello
|
||||
|
@ -17,29 +11,4 @@ import freemarker.template.TemplateExceptionHandler;
|
|||
@Configuration
|
||||
public class AppConfigGarr {
|
||||
|
||||
@Bean
|
||||
public RestTemplate jrrRestTemplate(){
|
||||
//TODO: move configuration here from CatalogueRegistrator?
|
||||
return new RestTemplateBuilder().build();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public freemarker.template.Configuration freemarkerConfig(){
|
||||
freemarker.template.Configuration config = new freemarker.template.Configuration(freemarker.template.Configuration.VERSION_2_3_27);
|
||||
ClassTemplateLoader ctl = new ClassTemplateLoader(getClass(), "/eu/dnetlib/bmuse_webapp/sparql");
|
||||
config.setTemplateLoader(ctl);
|
||||
config.setDefaultEncoding("UTF-8");
|
||||
// Sets how errors will appear.
|
||||
// During web page *development* TemplateExceptionHandler.HTML_DEBUG_HANDLER is better.
|
||||
config.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
|
||||
|
||||
// Don't log exceptions inside FreeMarker that it will throw at you anyway:
|
||||
config.setLogTemplateExceptions(false);
|
||||
|
||||
// Wrap unchecked exceptions thrown during template processing into TemplateException-s.
|
||||
config.setWrapUncheckedExceptions(true);
|
||||
|
||||
return config;
|
||||
}
|
||||
}
|
|
@ -4,31 +4,24 @@ import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
|
|||
import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
|
||||
import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
|
||||
import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;
|
||||
import eu.dnetlib.bmuse_webapp.scraper.ServiceScraper;
|
||||
import eu.dnetlib.bmuse_webapp.utils.UrlParser;
|
||||
import hwu.elixir.utils.Helpers;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Properties;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
|
||||
/**
|
||||
* Runs the scrape. Collects a list of URLs (in the form of CrawlRecords) to scrape.
|
||||
* Scrapes them in turn, writes the (bio)schema markup extracted to a file (1 file per URL)
|
||||
* and adds provenance to the CrawlRecord.
|
||||
*
|
||||
*
|
||||
*/
|
||||
public class ServiceScrapeDriver {
|
||||
|
@ -96,31 +89,15 @@ public class ServiceScrapeDriver {
|
|||
|
||||
logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
|
||||
while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
|
||||
logger.info(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
|
||||
logger.debug(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
|
||||
|
||||
ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
|
||||
scrape1.setName("S1");
|
||||
|
||||
// ScrapeThread scrape2 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
|
||||
// scrape2.setName("S2");
|
||||
//
|
||||
// ScrapeThread scrape3 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
|
||||
// scrape3.setName("S3");
|
||||
//
|
||||
// ScrapeThread scrape4 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
|
||||
// scrape4.setName("S4");
|
||||
|
||||
scrape1.start();
|
||||
// scrape2.start();
|
||||
// scrape3.start();
|
||||
// scrape4.start();
|
||||
long startTime = System.nanoTime();
|
||||
|
||||
try {
|
||||
scrape1.join();
|
||||
// scrape2.join();
|
||||
// scrape3.join();
|
||||
// scrape4.join();
|
||||
} catch (InterruptedException e) {
|
||||
logger.error("Exception waiting on thread");
|
||||
e.printStackTrace();
|
||||
|
@ -135,21 +112,13 @@ public class ServiceScrapeDriver {
|
|||
}
|
||||
|
||||
logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
|
||||
|
||||
long endTime = System.nanoTime();
|
||||
long timeElapsed = endTime - startTime;
|
||||
logger.info("Time in s to complete: " + timeElapsed / 1e+9);
|
||||
|
||||
updateDatabase(scrapeState);
|
||||
logger.debug("Time in s to complete: " + timeElapsed / 1e+9);
|
||||
pagesCounter += numberOfPagesToCrawlInALoop;
|
||||
|
||||
|
||||
logger.info("ENDED loop");
|
||||
logger.debug("ENDED loop");
|
||||
}
|
||||
|
||||
// Map<String, Object> nquads = scrapeState.getNquadsConcurrentHashMap();
|
||||
// logger.info("Available nquads records: "+nquads.size() );
|
||||
|
||||
logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
|
||||
|
||||
File output = new File(outputFolder.concat("/").concat(outputFilename));
|
||||
|
@ -173,20 +142,7 @@ public class ServiceScrapeDriver {
|
|||
}
|
||||
}
|
||||
bufferedWriter.close();
|
||||
logger.info(" dump to "+output.getAbsolutePath());
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param scrapeState State of scrape at end
|
||||
* @return true if success / false otherwise
|
||||
* @see ScrapeState
|
||||
* @see CrawlRecord
|
||||
*/
|
||||
private boolean updateDatabase(ScrapeState scrapeState) {
|
||||
boolean result = false;
|
||||
|
||||
return result;
|
||||
logger.info(" Data stored into "+output.getAbsolutePath());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -256,6 +212,4 @@ public class ServiceScrapeDriver {
|
|||
String[] parts = pageUrl.split("/");
|
||||
return parts[parts.length - 1];
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -3,21 +3,19 @@ package eu.dnetlib.bmuse_webapp.publisher;
|
|||
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
|
||||
import eu.dnetlib.common.controller.AbstractDnetController;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.io.LineIterator;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tomcat.jni.FileInfo;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMethod;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author enrico.ottonello
|
||||
|
@ -28,30 +26,31 @@ import java.util.List;
|
|||
@RequestMapping("/api")
|
||||
public class BMUSEWebappController extends AbstractDnetController {
|
||||
|
||||
private static final Log log = LogFactory.getLog(BMUSEWebappController.class);
|
||||
@Value("${outputFolder}")
|
||||
private String outputFolder;
|
||||
@Value("${outputDataPattern}")
|
||||
private String outputDataPattern;
|
||||
|
||||
@RequestMapping(value = "/version", method = RequestMethod.GET)
|
||||
public String version() throws BMUSEWebappException {
|
||||
return "1.0.0-SNAPSHOT";
|
||||
}
|
||||
private static Logger logger = LoggerFactory.getLogger(BMUSEWebappController.class);
|
||||
|
||||
@RequestMapping(value = "/scrape", method = RequestMethod.GET)
|
||||
public String scrape(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
|
||||
@RequestMapping(value = "/startScraping", method = RequestMethod.GET)
|
||||
public String startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
|
||||
|
||||
logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
|
||||
|
||||
log.info("datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
|
||||
// String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz"; scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
|
||||
// String sitemapUrl = "https://proteinensemble.org/sitemap2.xml.gz"; scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
|
||||
// String sitemapUrl = "https://disprot.org/sitemap2.xml.gz"; scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
|
||||
String sitemapUrlKey = "loc";
|
||||
String outputFilename = datasourceKey.concat("_base64_gzipped_nquads.txt");
|
||||
String outputFilename = datasourceKey.concat(getOutputDataPattern());
|
||||
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
|
||||
service.start();
|
||||
return "started";
|
||||
}
|
||||
|
||||
@RequestMapping(value = "/nquads", method = RequestMethod.GET)
|
||||
public String nquads(HttpServletResponse response) throws BMUSEWebappException, IOException {
|
||||
LineIterator it = FileUtils.lineIterator(new File("/Users/enrico.ottonello/data/bmuse-output/output.nq"), "UTF-8");
|
||||
@RequestMapping(value = "/getNQuads", method = RequestMethod.GET)
|
||||
public String getNQuads(@RequestParam final String datasourceKey, HttpServletResponse response) throws BMUSEWebappException, IOException {
|
||||
|
||||
logger.info("<GETNQUADS> datasourceKey: "+datasourceKey);
|
||||
|
||||
LineIterator it = FileUtils.lineIterator(new File(getOutputFolder().concat("/").concat(datasourceKey).concat(getOutputDataPattern())), "UTF-8");
|
||||
try {
|
||||
while (it.hasNext()) {
|
||||
String line = it.nextLine();
|
||||
|
@ -62,4 +61,12 @@ public class BMUSEWebappController extends AbstractDnetController {
|
|||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
public String getOutputFolder() {
|
||||
return outputFolder;
|
||||
}
|
||||
|
||||
public String getOutputDataPattern() {
|
||||
return outputDataPattern;
|
||||
}
|
||||
}
|
|
@ -1,7 +1,6 @@
|
|||
|
||||
package eu.dnetlib.bmuse_webapp.scraper;
|
||||
|
||||
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
|
||||
import hwu.elixir.scrape.exceptions.MissingMarkupException;
|
||||
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
||||
import org.apache.any23.Any23;
|
||||
|
@ -12,8 +11,6 @@ import org.apache.any23.writer.NTriplesWriter;
|
|||
import org.apache.any23.writer.TripleHandler;
|
||||
import org.apache.any23.writer.TripleHandlerException;
|
||||
import org.apache.commons.io.output.ByteArrayOutputStream;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.eclipse.rdf4j.model.IRI;
|
||||
import org.eclipse.rdf4j.model.Model;
|
||||
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
|
||||
|
@ -27,7 +24,7 @@ import java.io.StringWriter;
|
|||
|
||||
public class BMUSEScraper extends ScraperFilteredCore {
|
||||
|
||||
private static final Log logger = LogFactory.getLog(BMUSEScraper.class);
|
||||
private static Logger logger = LoggerFactory.getLogger(BMUSEScraper.class);
|
||||
|
||||
public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
|
||||
logger.debug(url + " > scraping");
|
||||
|
|
|
@ -6,13 +6,8 @@ import hwu.elixir.scrape.exceptions.CannotWriteException;
|
|||
import hwu.elixir.scrape.exceptions.FourZeroFourException;
|
||||
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
|
||||
import hwu.elixir.scrape.exceptions.MissingMarkupException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
//import org.apache.commons.logging.Log;
|
||||
//import org.apache.commons.logging.LogFactory;
|
||||
//import org.slf4j.Logger;
|
||||
//import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
|
@ -28,7 +23,7 @@ public class ScrapeThread extends Thread {
|
|||
private boolean fileWritten = true;
|
||||
private int scrapeVersion = 1;
|
||||
|
||||
private static final Log logger = LogFactory.getLog(ScrapeThread.class);
|
||||
private static Logger logger = LoggerFactory.getLogger(ScrapeThread.class);
|
||||
|
||||
/**
|
||||
* Sets up a thread for actually scraping.
|
||||
|
@ -68,8 +63,7 @@ public class ScrapeThread extends Thread {
|
|||
|
||||
try {
|
||||
String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
|
||||
// scrapeState.addNquads(record.getName(), nquads);
|
||||
logger.info("downloaded "+record.getUrl());
|
||||
logger.info("downloaded "+record.getUrl() + " leftToScrape:" + scrapeState.pagesLeftToScrape());
|
||||
record.setNquads(CompressorUtil.compressValue(nquads));
|
||||
if (!nquads.isEmpty()) {
|
||||
scrapeState.addSuccessfulScrapedURL(record);
|
||||
|
|
|
@ -3,7 +3,6 @@ package eu.dnetlib.bmuse_webapp.scraper;
|
|||
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
|
||||
import hwu.elixir.scrape.exceptions.*;
|
||||
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
||||
import org.apache.commons.lang.time.DateUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -20,7 +19,7 @@ import org.slf4j.LoggerFactory;
|
|||
*/
|
||||
public class ServiceScraper extends ScraperFilteredCore {
|
||||
|
||||
private static Logger logger = LoggerFactory.getLogger(System.class.getName());
|
||||
private static Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
|
||||
|
||||
private StatusOfScrape status= null;
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ server.port=8281
|
|||
|
||||
spring.profiles.active=garr
|
||||
|
||||
logging.file.name = /var/log/springboot/9480/oa_organizations.log
|
||||
logging.file.name = /var/log/bioschemas/log/bioschemas-api.log
|
||||
|
||||
maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
|
||||
|
||||
|
@ -17,7 +17,8 @@ management.endpoints.web.path-mapping.prometheus = metrics
|
|||
management.endpoints.web.path-mapping.health = health
|
||||
|
||||
waitTime=5
|
||||
outputFolder=/Users/enrico.ottonello/data/bmuse-output
|
||||
outputFolder=/data
|
||||
outputDataPattern=_base64_gzipped_nquads.txt
|
||||
numberOfPagesToCrawlInALoop=8
|
||||
totalNumberOfPagesToCrawlInASession=32
|
||||
chromiumDriverLocation = /usr/local/bin/chromedriver
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<configuration>
|
||||
<appender name="SAVE-TO-FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
|
||||
<file>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log</file>
|
||||
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
|
||||
<Pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M - %msg%n</Pattern>
|
||||
</encoder>
|
||||
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
|
||||
<fileNamePattern>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp_%d{dd-MM-yyyy}_%i.log</fileNamePattern>
|
||||
<maxFileSize>10MB</maxFileSize>
|
||||
<maxHistory>10</maxHistory>
|
||||
<totalSizeCap>100MB</totalSizeCap>
|
||||
</rollingPolicy>
|
||||
</appender>
|
||||
<logger name="org.springframework" level="INFO" additivity="false">
|
||||
<appender-ref ref="SAVE-TO-FILE" />
|
||||
</logger>
|
||||
<logger name="root" level="INFO" additivity="false">
|
||||
<appender-ref ref="SAVE-TO-FILE" />
|
||||
</logger>
|
||||
<logger name="eu.dnetlib" level="INFO" additivity="false">
|
||||
<appender-ref ref="SAVE-TO-FILE" />
|
||||
</logger>
|
||||
<logger name="eu.dnetlib.bmuse_webapp" level="INFO" additivity="false">
|
||||
<appender-ref ref="SAVE-TO-FILE" />
|
||||
</logger>
|
||||
<logger name="hwu.elixir" level="INFO" additivity="false">
|
||||
<appender-ref ref="SAVE-TO-FILE" />
|
||||
</logger>
|
||||
</configuration>
|
38
pom.xml
38
pom.xml
|
@ -278,6 +278,43 @@
|
|||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<!-- Bioschemas BMUSE -->
|
||||
<dependency>
|
||||
<groupId>hwu.elixir</groupId>
|
||||
<artifactId>bmuse-core</artifactId>
|
||||
<version>0.5.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.any23</groupId>
|
||||
<artifactId>apache-any23-core</artifactId>
|
||||
<version>2.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.eclipse.rdf4j</groupId>
|
||||
<artifactId>rdf4j-rio-rdfxml</artifactId>
|
||||
<version>3.7.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.eclipse.rdf4j</groupId>
|
||||
<artifactId>rdf4j-model</artifactId>
|
||||
<version>3.7.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>1.13.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.seleniumhq.selenium</groupId>
|
||||
<artifactId>selenium-java</artifactId>
|
||||
<version>3.141.59</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-validator</groupId>
|
||||
<artifactId>commons-validator</artifactId>
|
||||
<version>1.6</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
||||
|
@ -418,5 +455,6 @@
|
|||
<javamelody.version>1.71.0</javamelody.version>
|
||||
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
|
||||
<dockerfile-maven-version>1.3.6</dockerfile-maven-version>
|
||||
<bioschemas-commons-io-version>2.6</bioschemas-commons-io-version>
|
||||
</properties>
|
||||
</project>
|
||||
|
|
Loading…
Reference in New Issue