fixed dependencies declaration, configuration properties

This commit is contained in:
Enrico Ottonello 2022-06-07 13:02:36 +02:00
parent 7375534764
commit 079b2506e6
11 changed files with 87 additions and 355 deletions

View File

@ -12,62 +12,38 @@
<artifactId>bioschemas-api</artifactId> <artifactId>bioschemas-api</artifactId>
<dependencies> <dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency> <dependency>
<groupId>hwu.elixir</groupId> <groupId>hwu.elixir</groupId>
<artifactId>bmuse-core</artifactId> <artifactId>bmuse-core</artifactId>
<version>0.5.4</version>
</dependency> </dependency>
<dependency>
<groupId>org.freemarker</groupId>
<artifactId>freemarker</artifactId>
<version>2.3.27-incubating</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.any23</groupId> <groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId> <artifactId>apache-any23-core</artifactId>
<version>2.3</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.eclipse.rdf4j</groupId> <groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId> <artifactId>rdf4j-rio-rdfxml</artifactId>
<version>3.7.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.eclipse.rdf4j</groupId> <groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-model</artifactId> <artifactId>rdf4j-model</artifactId>
<version>3.7.1</version>
</dependency> </dependency>
<!-- rdf 2.5.4 to 3.7.1-->
<dependency> <dependency>
<groupId>org.jsoup</groupId> <groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId> <artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.seleniumhq.selenium</groupId> <groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId> <artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-io</groupId> <groupId>commons-io</groupId>
<artifactId>commons-io</artifactId> <artifactId>commons-io</artifactId>
<version>2.6</version> <version>${bioschemas-commons-io-version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-validator</groupId> <groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId> <artifactId>commons-validator</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.3</version>
</dependency> </dependency>
</dependencies> </dependencies>
<build> <build>

View File

@ -1,173 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the dnet-bmuse-webapp module: a jar-packaged Spring Boot application (eu.dnetlib:dnet-bmuse-webapp:1.0.0-SNAPSHOT). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!-- Inherit defaults (plugin configuration and managed dependency versions) from Spring Boot -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.3.RELEASE</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-bmuse-webapp</artifactId>
<packaging>jar</packaging>
<version>1.0.0-SNAPSHOT</version>
<!-- NOTE(review): developerConnection points at an SVN trunk while url is a GitHub path under spring-projects, which looks like a leftover placeholder; confirm the real project URL -->
<scm>
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk</developerConnection>
<url>https://github.com/spring-projects/spring-boot/spring-boot-starter-parent/dnet-bmuse-webapp</url>
</scm>
<ciManagement>
<system>jenkins</system>
<url>https://jenkins-dnet.d4science.org/view/DNet50/job/DSL50_dnet-bmuse-webapp/</url>
</ciManagement>
<!-- Release artifacts are deployed to the D-Net 5 releases repository -->
<distributionManagement>
<repository>
<id>dnet5-releases</id>
<name>D-Net 5 Releases</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
<layout>default</layout>
</repository>
</distributionManagement>
<!-- Additional repositories used to resolve dependencies. NOTE(review): the two dnet5 repositories use plain http URLs, which Maven 3.8.1 and later block by default; confirm whether https endpoints are available -->
<repositories>
<!-- Releases only: snapshots are disabled for this repository -->
<repository>
<id>dnet-deps</id>
<name>D-Net Dependencies</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
<layout>default</layout>
</repository>
<!-- Snapshots are disabled for this repository -->
<repository>
<id>dnet5-releases</id>
<name>D-Net 5 Releases</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases</url>
<layout>default</layout>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<!-- Snapshots are enabled for this repository -->
<repository>
<id>dnet5-snapshots</id>
<name>D-Net 5 Snapshots</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
<dependencies>
<!-- Test-scoped dependencies. NOTE(review): junit is pinned to a release-candidate version, and spring-boot-starter-test presumably brings in its own JUnit; confirm this explicit pin is needed -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13-rc-1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- Spring Boot modules: no explicit version is declared here, so the version comes from the parent's dependency management -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-autoconfigure</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Third-party libraries with explicitly pinned versions -->
<dependency>
<groupId>hwu.elixir</groupId>
<artifactId>bmuse-core</artifactId>
<version>0.5.4</version>
</dependency>
<dependency>
<groupId>org.freemarker</groupId>
<artifactId>freemarker</artifactId>
<version>2.3.27-incubating</version>
</dependency>
<dependency>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId>
<version>2.3</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId>
<version>3.7.1</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-model</artifactId>
<version>3.7.1</version>
</dependency>
<!-- rdf 2.5.4 to 3.7.1-->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.6</version>
</dependency>
<!-- NOTE(review): the Spring Boot parent presumably manages a logback version already; confirm this explicit pin is intentional -->
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.3</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- executable=true asks the Spring Boot plugin to produce a fully executable jar (launch script embedded) -->
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<executable>true</executable>
</configuration>
</plugin>
</plugins>
</build>
<!-- java.version is the property the Spring Boot parent uses for the compiler level; javadoc errors are configured not to fail the build -->
<properties>
<java.version>1.8</java.version>
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
</properties>
<profiles>
<!-- Activated on JDK 1.8 and later: defines javadoc.opts as -Xdoclint:none. NOTE(review): nothing in this file references the javadoc.opts property; confirm a javadoc plugin configuration consumes it -->
<profile>
<id>java8-doclint-disabled</id>
<activation>
<jdk>[1.8,)</jdk>
</activation>
<properties>
<javadoc.opts>-Xdoclint:none</javadoc.opts>
</properties>
</profile>
</profiles>
</project>

View File

@ -1,13 +1,7 @@
package eu.dnetlib.bmuse_webapp; package eu.dnetlib.bmuse_webapp;
import org.springframework.boot.web.client.RestTemplateBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Profile; import org.springframework.context.annotation.Profile;
import org.springframework.web.client.RestTemplate;
import freemarker.cache.ClassTemplateLoader;
import freemarker.template.TemplateExceptionHandler;
/** /**
* @author enrico.ottonello * @author enrico.ottonello
@ -17,29 +11,4 @@ import freemarker.template.TemplateExceptionHandler;
@Configuration @Configuration
public class AppConfigGarr { public class AppConfigGarr {
@Bean
public RestTemplate jrrRestTemplate(){
//TODO: move configuration here from CatalogueRegistrator?
return new RestTemplateBuilder().build();
}
@Bean
public freemarker.template.Configuration freemarkerConfig(){
freemarker.template.Configuration config = new freemarker.template.Configuration(freemarker.template.Configuration.VERSION_2_3_27);
ClassTemplateLoader ctl = new ClassTemplateLoader(getClass(), "/eu/dnetlib/bmuse_webapp/sparql");
config.setTemplateLoader(ctl);
config.setDefaultEncoding("UTF-8");
// Sets how errors will appear.
// During web page *development* TemplateExceptionHandler.HTML_DEBUG_HANDLER is better.
config.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
// Don't log exceptions inside FreeMarker that it will throw at you anyway:
config.setLogTemplateExceptions(false);
// Wrap unchecked exceptions thrown during template processing into TemplateException-s.
config.setWrapUncheckedExceptions(true);
return config;
}
} }

View File

@ -4,31 +4,24 @@ import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper; import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
import eu.dnetlib.bmuse_webapp.scraper.ScrapeState; import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread; import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;
import eu.dnetlib.bmuse_webapp.scraper.ServiceScraper;
import eu.dnetlib.bmuse_webapp.utils.UrlParser; import eu.dnetlib.bmuse_webapp.utils.UrlParser;
import hwu.elixir.utils.Helpers;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*; import java.io.*;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.Date;
import java.util.concurrent.ConcurrentHashMap; import java.util.List;
import java.util.Objects;
import java.util.Properties;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
/** /**
* Runs the scrape. Collect a list of URLs (in the form of CrawlRecords) to scrape. * Runs the scrape. Collect a list of URLs (in the form of CrawlRecords) to scrape.
* Scrapes them in turn, writes the (bio)schema markup extracted to a file (1 file per URL)
* and adds provenance to the CrawlRecord.
*
* *
*/ */
public class ServiceScrapeDriver { public class ServiceScrapeDriver {
@ -96,31 +89,15 @@ public class ServiceScrapeDriver {
logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis()))); logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
while (pagesCounter < totalNumberOfPagesToCrawlInASession) { while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
logger.info(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession); logger.debug(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion); ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
scrape1.setName("S1"); scrape1.setName("S1");
// ScrapeThread scrape2 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
// scrape2.setName("S2");
//
// ScrapeThread scrape3 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
// scrape3.setName("S3");
//
// ScrapeThread scrape4 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
// scrape4.setName("S4");
scrape1.start(); scrape1.start();
// scrape2.start();
// scrape3.start();
// scrape4.start();
long startTime = System.nanoTime(); long startTime = System.nanoTime();
try { try {
scrape1.join(); scrape1.join();
// scrape2.join();
// scrape3.join();
// scrape4.join();
} catch (InterruptedException e) { } catch (InterruptedException e) {
logger.error("Exception waiting on thread"); logger.error("Exception waiting on thread");
e.printStackTrace(); e.printStackTrace();
@ -135,21 +112,13 @@ public class ServiceScrapeDriver {
} }
logger.debug("Value of isFileWritten: " + scrape1.isFileWritten()); logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
long endTime = System.nanoTime(); long endTime = System.nanoTime();
long timeElapsed = endTime - startTime; long timeElapsed = endTime - startTime;
logger.info("Time in s to complete: " + timeElapsed / 1e+9); logger.debug("Time in s to complete: " + timeElapsed / 1e+9);
updateDatabase(scrapeState);
pagesCounter += numberOfPagesToCrawlInALoop; pagesCounter += numberOfPagesToCrawlInALoop;
logger.debug("ENDED loop");
logger.info("ENDED loop");
} }
// Map<String, Object> nquads = scrapeState.getNquadsConcurrentHashMap();
// logger.info("Available nquads records: "+nquads.size() );
logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis()))); logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
File output = new File(outputFolder.concat("/").concat(outputFilename)); File output = new File(outputFolder.concat("/").concat(outputFilename));
@ -173,20 +142,7 @@ public class ServiceScrapeDriver {
} }
} }
bufferedWriter.close(); bufferedWriter.close();
logger.info(" dump to "+output.getAbsolutePath()); logger.info(" Data stored into "+output.getAbsolutePath());
}
/**
*
* @param scrapeState State of scrape at end
* @return true if success / false otherwise
* @see ScrapeState
* @see CrawlRecord
*/
private boolean updateDatabase(ScrapeState scrapeState) {
boolean result = false;
return result;
} }
/** /**
@ -256,6 +212,4 @@ public class ServiceScrapeDriver {
String[] parts = pageUrl.split("/"); String[] parts = pageUrl.split("/");
return parts[parts.length - 1]; return parts[parts.length - 1];
} }
} }

View File

@ -3,21 +3,19 @@ package eu.dnetlib.bmuse_webapp.publisher;
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver; import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
import eu.dnetlib.common.controller.AbstractDnetController; import eu.dnetlib.common.controller.AbstractDnetController;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator; import org.apache.commons.io.LineIterator;
import org.apache.commons.logging.Log; import org.slf4j.Logger;
import org.apache.commons.logging.LogFactory; import org.slf4j.LoggerFactory;
import org.apache.tomcat.jni.FileInfo; import org.springframework.beans.factory.annotation.Value;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.*; import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import javax.servlet.http.HttpServletResponse; import javax.servlet.http.HttpServletResponse;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.List;
/** /**
* @author enrico.ottonello * @author enrico.ottonello
@ -28,30 +26,31 @@ import java.util.List;
@RequestMapping("/api") @RequestMapping("/api")
public class BMUSEWebappController extends AbstractDnetController { public class BMUSEWebappController extends AbstractDnetController {
private static final Log log = LogFactory.getLog(BMUSEWebappController.class); @Value("${outputFolder}")
private String outputFolder;
@Value("${outputDataPattern}")
private String outputDataPattern;
@RequestMapping(value = "/version", method = RequestMethod.GET) private static Logger logger = LoggerFactory.getLogger(BMUSEWebappController.class);
public String version() throws BMUSEWebappException {
return "1.0.0-SNAPSHOT";
}
@RequestMapping(value = "/scrape", method = RequestMethod.GET) @RequestMapping(value = "/startScraping", method = RequestMethod.GET)
public String scrape(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException { public String startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
logger.info("<STARTSCRAPING> datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
log.info("datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
// String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz"; scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
// String sitemapUrl = "https://proteinensemble.org/sitemap2.xml.gz"; scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
// String sitemapUrl = "https://disprot.org/sitemap2.xml.gz"; scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
String sitemapUrlKey = "loc"; String sitemapUrlKey = "loc";
String outputFilename = datasourceKey.concat("_base64_gzipped_nquads.txt"); String outputFilename = datasourceKey.concat(getOutputDataPattern());
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename); ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
service.start(); service.start();
return "started"; return "started";
} }
@RequestMapping(value = "/nquads", method = RequestMethod.GET) @RequestMapping(value = "/getNQuads", method = RequestMethod.GET)
public String nquads(HttpServletResponse response) throws BMUSEWebappException, IOException { public String getNQuads(@RequestParam final String datasourceKey, HttpServletResponse response) throws BMUSEWebappException, IOException {
LineIterator it = FileUtils.lineIterator(new File("/Users/enrico.ottonello/data/bmuse-output/output.nq"), "UTF-8");
logger.info("<GETNQUADS> datasourceKey: "+datasourceKey);
LineIterator it = FileUtils.lineIterator(new File(getOutputFolder().concat("/").concat(datasourceKey).concat(getOutputDataPattern())), "UTF-8");
try { try {
while (it.hasNext()) { while (it.hasNext()) {
String line = it.nextLine(); String line = it.nextLine();
@ -62,4 +61,12 @@ public class BMUSEWebappController extends AbstractDnetController {
} }
return ""; return "";
} }
public String getOutputFolder() {
return outputFolder;
}
public String getOutputDataPattern() {
return outputDataPattern;
}
} }

View File

@ -1,7 +1,6 @@
package eu.dnetlib.bmuse_webapp.scraper; package eu.dnetlib.bmuse_webapp.scraper;
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
import hwu.elixir.scrape.exceptions.MissingMarkupException; import hwu.elixir.scrape.exceptions.MissingMarkupException;
import hwu.elixir.scrape.scraper.ScraperFilteredCore; import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.apache.any23.Any23; import org.apache.any23.Any23;
@ -12,8 +11,6 @@ import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler; import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException; import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.output.ByteArrayOutputStream; import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model; import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
@ -27,7 +24,7 @@ import java.io.StringWriter;
public class BMUSEScraper extends ScraperFilteredCore { public class BMUSEScraper extends ScraperFilteredCore {
private static final Log logger = LogFactory.getLog(BMUSEScraper.class); private static Logger logger = LoggerFactory.getLogger(BMUSEScraper.class);
public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception { public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
logger.debug(url + " > scraping"); logger.debug(url + " > scraping");

View File

@ -6,13 +6,8 @@ import hwu.elixir.scrape.exceptions.CannotWriteException;
import hwu.elixir.scrape.exceptions.FourZeroFourException; import hwu.elixir.scrape.exceptions.FourZeroFourException;
import hwu.elixir.scrape.exceptions.JsonLDInspectionException; import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
import hwu.elixir.scrape.exceptions.MissingMarkupException; import hwu.elixir.scrape.exceptions.MissingMarkupException;
import org.apache.commons.logging.Log; import org.slf4j.Logger;
import org.apache.commons.logging.LogFactory; import org.slf4j.LoggerFactory;
//import org.apache.commons.logging.Log;
//import org.apache.commons.logging.LogFactory;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
import java.util.Date; import java.util.Date;
/** /**
@ -28,7 +23,7 @@ public class ScrapeThread extends Thread {
private boolean fileWritten = true; private boolean fileWritten = true;
private int scrapeVersion = 1; private int scrapeVersion = 1;
private static final Log logger = LogFactory.getLog(ScrapeThread.class); private static Logger logger = LoggerFactory.getLogger(ScrapeThread.class);
/** /**
* Sets up a thread for actually scraping. * Sets up a thread for actually scraping.
@ -68,8 +63,7 @@ public class ScrapeThread extends Thread {
try { try {
String nquads = process.getNQUADSFromUrl(record.getUrl(), true); String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
// scrapeState.addNquads(record.getName(), nquads); logger.info("downloaded "+record.getUrl() + " leftToScrape:" + scrapeState.pagesLeftToScrape());
logger.info("downloaded "+record.getUrl());
record.setNquads(CompressorUtil.compressValue(nquads)); record.setNquads(CompressorUtil.compressValue(nquads));
if (!nquads.isEmpty()) { if (!nquads.isEmpty()) {
scrapeState.addSuccessfulScrapedURL(record); scrapeState.addSuccessfulScrapedURL(record);

View File

@ -3,7 +3,6 @@ package eu.dnetlib.bmuse_webapp.scraper;
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape; import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
import hwu.elixir.scrape.exceptions.*; import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore; import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.apache.commons.lang.time.DateUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -20,7 +19,7 @@ import org.slf4j.LoggerFactory;
*/ */
public class ServiceScraper extends ScraperFilteredCore { public class ServiceScraper extends ScraperFilteredCore {
private static Logger logger = LoggerFactory.getLogger(System.class.getName()); private static Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
private StatusOfScrape status= null; private StatusOfScrape status= null;

View File

@ -3,7 +3,7 @@ server.port=8281
spring.profiles.active=garr spring.profiles.active=garr
logging.file.name = /var/log/springboot/9480/oa_organizations.log logging.file.name = /var/log/bioschemas/log/bioschemas-api.log
maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
@ -17,7 +17,8 @@ management.endpoints.web.path-mapping.prometheus = metrics
management.endpoints.web.path-mapping.health = health management.endpoints.web.path-mapping.health = health
waitTime=5 waitTime=5
outputFolder=/Users/enrico.ottonello/data/bmuse-output outputFolder=/data
outputDataPattern=_base64_gzipped_nquads.txt
numberOfPagesToCrawlInALoop=8 numberOfPagesToCrawlInALoop=8
totalNumberOfPagesToCrawlInASession=32 totalNumberOfPagesToCrawlInASession=32
chromiumDriverLocation = /usr/local/bin/chromedriver chromiumDriverLocation = /usr/local/bin/chromedriver

View File

@ -1,30 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Logback configuration: every logger configured here writes INFO and above to a single rolling log file. -->
<configuration>
<!-- Rolling file appender; the file element below is the active log file -->
<appender name="SAVE-TO-FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log</file>
<!-- Line layout: timestamp, thread, level, abbreviated logger name plus calling method, then the message. NOTE(review): %M needs caller data, which the logback documentation describes as slow to compute; confirm it is worth the cost -->
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<Pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M - %msg%n</Pattern>
</encoder>
<!-- Roll over per day and whenever the active file reaches 10MB (%i is the index within the day); history is limited to 10 periods and 100MB in total -->
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>/var/log/dnet-bmuse-webapp/dnet-bmuse-webapp_%d{dd-MM-yyyy}_%i.log</fileNamePattern>
<maxFileSize>10MB</maxFileSize>
<maxHistory>10</maxHistory>
<totalSizeCap>100MB</totalSizeCap>
</rollingPolicy>
</appender>
<!-- Each logger below sends INFO and above to SAVE-TO-FILE; additivity=false stops events from also being handed to ancestor loggers' appenders -->
<logger name="org.springframework" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
<!-- NOTE(review): this configures a logger named "root" through a logger element, while logback also provides a dedicated root element; confirm this form is applied to the root logger as intended -->
<logger name="root" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
<logger name="eu.dnetlib" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
<!-- NOTE(review): eu.dnetlib.bmuse_webapp is a descendant of eu.dnetlib and uses the same level and appender, so this entry looks redundant; confirm before removing -->
<logger name="eu.dnetlib.bmuse_webapp" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
<logger name="hwu.elixir" level="INFO" additivity="false">
<appender-ref ref="SAVE-TO-FILE" />
</logger>
</configuration>

38
pom.xml
View File

@ -278,6 +278,43 @@
</exclusions> </exclusions>
</dependency> </dependency>
<!-- Bioschemas BMUSE -->
<dependency>
<groupId>hwu.elixir</groupId>
<artifactId>bmuse-core</artifactId>
<version>0.5.4</version>
</dependency>
<dependency>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId>
<version>2.3</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId>
<version>3.7.1</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-model</artifactId>
<version>3.7.1</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.6</version>
</dependency>
</dependencies> </dependencies>
</dependencyManagement> </dependencyManagement>
@ -418,5 +455,6 @@
<javamelody.version>1.71.0</javamelody.version> <javamelody.version>1.71.0</javamelody.version>
<maven.javadoc.failOnError>false</maven.javadoc.failOnError> <maven.javadoc.failOnError>false</maven.javadoc.failOnError>
<dockerfile-maven-version>1.3.6</dockerfile-maven-version> <dockerfile-maven-version>1.3.6</dockerfile-maven-version>
<bioschemas-commons-io-version>2.6</bioschemas-commons-io-version>
</properties> </properties>
</project> </project>