diff --git a/apps/bioschemas-api/pom.xml b/apps/bioschemas-api/pom.xml
index 4bcb0408..faa25156 100644
--- a/apps/bioschemas-api/pom.xml
+++ b/apps/bioschemas-api/pom.xml
@@ -12,62 +12,38 @@
bioschemas-api
-
- org.springframework.boot
- spring-boot-starter-test
- test
-
hwu.elixir
bmuse-core
- 0.5.4
-
- org.freemarker
- freemarker
- 2.3.27-incubating
-
-
org.apache.any23
apache-any23-core
- 2.3
org.eclipse.rdf4j
rdf4j-rio-rdfxml
- 3.7.1
org.eclipse.rdf4j
rdf4j-model
- 3.7.1
-
org.jsoup
jsoup
- 1.13.1
org.seleniumhq.selenium
selenium-java
- 3.141.59
commons-io
commons-io
- 2.6
+ ${bioschemas-commons-io-version}
commons-validator
commons-validator
- 1.6
-
-
- ch.qos.logback
- logback-classic
- 1.2.3
diff --git a/apps/bioschemas-api/pom.xml.original b/apps/bioschemas-api/pom.xml.original
deleted file mode 100644
index f79644f5..00000000
--- a/apps/bioschemas-api/pom.xml.original
+++ /dev/null
@@ -1,173 +0,0 @@
-
-
-
- org.springframework.boot
- spring-boot-starter-parent
- 2.1.3.RELEASE
-
-
- 4.0.0
- eu.dnetlib
- dnet-bmuse-webapp
- jar
- 1.0.0-SNAPSHOT
-
- scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk
- https://github.com/spring-projects/spring-boot/spring-boot-starter-parent/dnet-bmuse-webapp
-
-
- jenkins
- https://jenkins-dnet.d4science.org/view/DNet50/job/DSL50_dnet-bmuse-webapp/
-
-
-
- dnet5-releases
- D-Net 5 Releases
- http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases
- default
-
-
-
-
-
-
-
- dnet-deps
- D-Net Dependencies
- https://maven.d4science.org/nexus/content/repositories/dnet-deps/
-
- true
-
-
- false
-
- default
-
-
- dnet5-releases
- D-Net 5 Releases
- http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases
- default
-
- false
-
-
-
- dnet5-snapshots
- D-Net 5 Snapshots
- http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots
- default
-
- true
-
-
-
-
-
-
- junit
- junit
- 4.13-rc-1
- test
-
-
- org.springframework.boot
- spring-boot-starter-test
- test
-
-
- org.springframework.boot
- spring-boot-autoconfigure
-
-
- org.springframework.boot
- spring-boot
-
-
- org.springframework.boot
- spring-boot-starter-web
-
-
- hwu.elixir
- bmuse-core
- 0.5.4
-
-
- org.freemarker
- freemarker
- 2.3.27-incubating
-
-
-
- org.apache.any23
- apache-any23-core
- 2.3
-
-
- org.eclipse.rdf4j
- rdf4j-rio-rdfxml
- 3.7.1
-
-
- org.eclipse.rdf4j
- rdf4j-model
- 3.7.1
-
-
-
- org.jsoup
- jsoup
- 1.13.1
-
-
- org.seleniumhq.selenium
- selenium-java
- 3.141.59
-
-
- commons-io
- commons-io
- 2.6
-
-
- commons-validator
- commons-validator
- 1.6
-
-
- ch.qos.logback
- logback-classic
- 1.2.3
-
-
-
-
-
-
-
- org.springframework.boot
- spring-boot-maven-plugin
-
- true
-
-
-
-
-
-
- 1.8
- false
-
-
-
-
- java8-doclint-disabled
-
- [1.8,)
-
-
- -Xdoclint:none
-
-
-
-
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java
index 634f9172..529980e9 100644
--- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java
@@ -1,13 +1,7 @@
package eu.dnetlib.bmuse_webapp;
-import org.springframework.boot.web.client.RestTemplateBuilder;
-import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Profile;
-import org.springframework.web.client.RestTemplate;
-
-import freemarker.cache.ClassTemplateLoader;
-import freemarker.template.TemplateExceptionHandler;
/**
* @author enrico.ottonello
@@ -17,29 +11,4 @@ import freemarker.template.TemplateExceptionHandler;
@Configuration
public class AppConfigGarr {
- @Bean
- public RestTemplate jrrRestTemplate(){
- //TODO: move configuration here from CatalogueRegistrator?
- return new RestTemplateBuilder().build();
- }
-
-
- @Bean
- public freemarker.template.Configuration freemarkerConfig(){
- freemarker.template.Configuration config = new freemarker.template.Configuration(freemarker.template.Configuration.VERSION_2_3_27);
- ClassTemplateLoader ctl = new ClassTemplateLoader(getClass(), "/eu/dnetlib/bmuse_webapp/sparql");
- config.setTemplateLoader(ctl);
- config.setDefaultEncoding("UTF-8");
- // Sets how errors will appear.
- // During web page *development* TemplateExceptionHandler.HTML_DEBUG_HANDLER is better.
- config.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER);
-
- // Don't log exceptions inside FreeMarker that it will thrown at you anyway:
- config.setLogTemplateExceptions(false);
-
- // Wrap unchecked exceptions thrown during template processing into TemplateException-s.
- config.setWrapUncheckedExceptions(true);
-
- return config;
- }
}
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java
index 3b1ab451..f87578c8 100644
--- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java
@@ -4,31 +4,24 @@ import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord;
import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper;
import eu.dnetlib.bmuse_webapp.scraper.ScrapeState;
import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread;
-import eu.dnetlib.bmuse_webapp.scraper.ServiceScraper;
import eu.dnetlib.bmuse_webapp.utils.UrlParser;
-import hwu.elixir.utils.Helpers;
-import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import java.io.*;
-import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
-import java.util.*;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.Date;
+import java.util.List;
+import java.util.Objects;
+import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Runs the scrape. Collects a list of URLs (in the form of CrawlRecords) to scrape.
- * Scrapes them in turn, writes the (bio)schema markup extracted to a file (1 file per URL)
- * and adds provenance to the CrawlRecord.
- *
*
*/
public class ServiceScrapeDriver {
@@ -96,31 +89,15 @@ public class ServiceScrapeDriver {
logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
while (pagesCounter < totalNumberOfPagesToCrawlInASession) {
- logger.info(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
+ logger.debug(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession);
ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion);
scrape1.setName("S1");
-
-// ScrapeThread scrape2 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
-// scrape2.setName("S2");
-//
-// ScrapeThread scrape3 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
-// scrape3.setName("S3");
-//
-// ScrapeThread scrape4 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder);
-// scrape4.setName("S4");
-
scrape1.start();
-// scrape2.start();
-// scrape3.start();
-// scrape4.start();
long startTime = System.nanoTime();
try {
scrape1.join();
-// scrape2.join();
-// scrape3.join();
-// scrape4.join();
} catch (InterruptedException e) {
logger.error("Exception waiting on thread");
e.printStackTrace();
@@ -135,21 +112,13 @@ public class ServiceScrapeDriver {
}
logger.debug("Value of isFileWritten: " + scrape1.isFileWritten());
-
long endTime = System.nanoTime();
long timeElapsed = endTime - startTime;
- logger.info("Time in s to complete: " + timeElapsed / 1e+9);
-
- updateDatabase(scrapeState);
+ logger.debug("Time in s to complete: " + timeElapsed / 1e+9);
pagesCounter += numberOfPagesToCrawlInALoop;
-
-
- logger.info("ENDED loop");
+ logger.debug("ENDED loop");
}
-// Map nquads = scrapeState.getNquadsConcurrentHashMap();
-// logger.info("Available nquads records: "+nquads.size() );
-
logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis())));
File output = new File(outputFolder.concat("/").concat(outputFilename));
@@ -173,20 +142,7 @@ public class ServiceScrapeDriver {
}
}
bufferedWriter.close();
- logger.info(" dump to "+output.getAbsolutePath());
- }
-
- /**
- *
- * @param scrapeState State of scrape at end
- * @return true if success / false otherwise
- * @see ScrapeState
- * @see CrawlRecord
- */
- private boolean updateDatabase(ScrapeState scrapeState) {
- boolean result = false;
-
- return result;
+ logger.info(" Data stored into "+output.getAbsolutePath());
}
/**
@@ -256,6 +212,4 @@ public class ServiceScrapeDriver {
String[] parts = pageUrl.split("/");
return parts[parts.length - 1];
}
-
-
}
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java
index 22beeb97..2339a8b3 100644
--- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java
@@ -3,21 +3,19 @@ package eu.dnetlib.bmuse_webapp.publisher;
import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
import eu.dnetlib.common.controller.AbstractDnetController;
import org.apache.commons.io.FileUtils;
-import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.tomcat.jni.FileInfo;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.web.bind.annotation.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RequestMethod;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.RestController;
import javax.servlet.http.HttpServletResponse;
import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
-import java.util.List;
/**
* @author enrico.ottonello
@@ -28,30 +26,31 @@ import java.util.List;
@RequestMapping("/api")
public class BMUSEWebappController extends AbstractDnetController {
- private static final Log log = LogFactory.getLog(BMUSEWebappController.class);
+ @Value("${outputFolder}")
+ private String outputFolder;
+ @Value("${outputDataPattern}")
+ private String outputDataPattern;
- @RequestMapping(value = "/version", method = RequestMethod.GET)
- public String version() throws BMUSEWebappException {
- return "1.0.0-SNAPSHOT";
- }
+ /** SLF4J logger shared by all instances of this controller; never reassigned, hence final. */
+ private static final Logger logger = LoggerFactory.getLogger(BMUSEWebappController.class);
- @RequestMapping(value = "/scrape", method = RequestMethod.GET)
- public String scrape(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
+ /**
+  * Starts a scrape of the given sitemap for one datasource.
+  *
+  * The output file name is {@code datasourceKey} followed by the configured
+  * {@code outputDataPattern} suffix. Because {@code datasourceKey} comes straight from the
+  * HTTP request and ends up in a file name, keys containing a path separator are rejected
+  * so the resulting file cannot escape the output folder.
+  *
+  * @param datasourceKey key of the datasource; used as prefix of the output file name
+  * @param sitemapUrl URL of the sitemap handed to {@link ServiceScrapeDriver}
+  * @return the literal string {@code "started"}
+  * @throws IllegalArgumentException if {@code datasourceKey} contains {@code /} or {@code \}
+  */
+ @RequestMapping(value = "/startScraping", method = RequestMethod.GET)
+ public String startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException {
+
+ // Parameterized form: same message text, no string building when INFO is disabled.
+ logger.info(" datasourceKey: {} sitemapUrl:{}", datasourceKey, sitemapUrl);
+
+ // datasourceKey is untrusted input that becomes part of a file path: refuse path separators.
+ if (datasourceKey.contains("/") || datasourceKey.contains("\\")) {
+ throw new IllegalArgumentException("datasourceKey must not contain path separators: " + datasourceKey);
+ }
- log.info("datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl);
-// String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz"; scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz
-// String sitemapUrl = "https://proteinensemble.org/sitemap2.xml.gz"; scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz
-// String sitemapUrl = "https://disprot.org/sitemap2.xml.gz"; scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz
String sitemapUrlKey = "loc";
- String outputFilename = datasourceKey.concat("_base64_gzipped_nquads.txt");
+ String outputFilename = datasourceKey.concat(getOutputDataPattern());
ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename);
service.start();
return "started";
}
- @RequestMapping(value = "/nquads", method = RequestMethod.GET)
- public String nquads(HttpServletResponse response) throws BMUSEWebappException, IOException {
- LineIterator it = FileUtils.lineIterator(new File("/Users/enrico.ottonello/data/bmuse-output/output.nq"), "UTF-8");
+ @RequestMapping(value = "/getNQuads", method = RequestMethod.GET)
+ public String getNQuads(@RequestParam final String datasourceKey, HttpServletResponse response) throws BMUSEWebappException, IOException {
+
+ logger.info(" datasourceKey: "+datasourceKey);
+
+ LineIterator it = FileUtils.lineIterator(new File(getOutputFolder().concat("/").concat(datasourceKey).concat(getOutputDataPattern())), "UTF-8");
try {
while (it.hasNext()) {
String line = it.nextLine();
@@ -62,4 +61,12 @@ public class BMUSEWebappController extends AbstractDnetController {
}
return "";
}
+
+ /**
+  * Returns the folder configured through the {@code outputFolder} property.
+  *
+  * @return the value injected into the {@code outputFolder} field
+  */
+ public String getOutputFolder() {
+ return outputFolder;
+ }
+
+ /**
+  * Returns the suffix configured through the {@code outputDataPattern} property; the
+  * controller appends it to a datasource key to form the data file name.
+  *
+  * @return the value injected into the {@code outputDataPattern} field
+  */
+ public String getOutputDataPattern() {
+ return outputDataPattern;
+ }
}
\ No newline at end of file
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java
index 4203f18a..763c9f64 100644
--- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java
@@ -1,7 +1,6 @@
package eu.dnetlib.bmuse_webapp.scraper;
-import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.apache.any23.Any23;
@@ -12,8 +11,6 @@ import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.output.ByteArrayOutputStream;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
@@ -27,7 +24,7 @@ import java.io.StringWriter;
public class BMUSEScraper extends ScraperFilteredCore {
- private static final Log logger = LogFactory.getLog(BMUSEScraper.class);
+ /** Logger shared by all instances of this scraper; never reassigned, hence final. */
+ private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class);
public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
logger.debug(url + " > scraping");
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java
index 7fe1ef88..27847ad3 100644
--- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java
@@ -6,13 +6,8 @@ import hwu.elixir.scrape.exceptions.CannotWriteException;
import hwu.elixir.scrape.exceptions.FourZeroFourException;
import hwu.elixir.scrape.exceptions.JsonLDInspectionException;
import hwu.elixir.scrape.exceptions.MissingMarkupException;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-//import org.apache.commons.logging.Log;
-//import org.apache.commons.logging.LogFactory;
-//import org.slf4j.Logger;
-//import org.slf4j.LoggerFactory;
-
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.util.Date;
/**
@@ -28,7 +23,7 @@ public class ScrapeThread extends Thread {
private boolean fileWritten = true;
private int scrapeVersion = 1;
- private static final Log logger = LogFactory.getLog(ScrapeThread.class);
+ /** SLF4J logger shared by all scrape threads; never reassigned, hence final. */
+ private static final Logger logger = LoggerFactory.getLogger(ScrapeThread.class);
/**
* Sets up a thread for actually scraping.
@@ -68,8 +63,7 @@ public class ScrapeThread extends Thread {
try {
String nquads = process.getNQUADSFromUrl(record.getUrl(), true);
-// scrapeState.addNquads(record.getName(), nquads);
- logger.info("downloaded "+record.getUrl());
+ logger.info("downloaded "+record.getUrl() + " leftToScrape:" + scrapeState.pagesLeftToScrape());
record.setNquads(CompressorUtil.compressValue(nquads));
if (!nquads.isEmpty()) {
scrapeState.addSuccessfulScrapedURL(record);
diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java
index b13e9cb6..fbcc0483 100644
--- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java
+++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java
@@ -3,7 +3,6 @@ package eu.dnetlib.bmuse_webapp.scraper;
import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape;
import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
-import org.apache.commons.lang.time.DateUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -20,7 +19,7 @@ import org.slf4j.LoggerFactory;
*/
public class ServiceScraper extends ScraperFilteredCore {
- private static Logger logger = LoggerFactory.getLogger(System.class.getName());
+ /** SLF4J logger named after this class; never reassigned, hence final. */
+ private static final Logger logger = LoggerFactory.getLogger(ServiceScraper.class);
private StatusOfScrape status= null;
diff --git a/apps/bioschemas-api/src/main/resources/application.properties b/apps/bioschemas-api/src/main/resources/application.properties
index ce5f349b..7bec4ba6 100644
--- a/apps/bioschemas-api/src/main/resources/application.properties
+++ b/apps/bioschemas-api/src/main/resources/application.properties
@@ -3,7 +3,7 @@ server.port=8281
spring.profiles.active=garr
-logging.file.name = /var/log/springboot/9480/oa_organizations.log
+logging.file.name = /var/log/bioschemas/log/bioschemas-api.log
maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml
@@ -17,7 +17,8 @@ management.endpoints.web.path-mapping.prometheus = metrics
management.endpoints.web.path-mapping.health = health
waitTime=5
-outputFolder=/Users/enrico.ottonello/data/bmuse-output
+outputFolder=/data
+outputDataPattern=_base64_gzipped_nquads.txt
numberOfPagesToCrawlInALoop=8
totalNumberOfPagesToCrawlInASession=32
chromiumDriverLocation = /usr/local/bin/chromedriver
diff --git a/apps/bioschemas-api/src/main/resources/logback-spring.xml b/apps/bioschemas-api/src/main/resources/logback-spring.xml
deleted file mode 100644
index 3c5e86fe..00000000
--- a/apps/bioschemas-api/src/main/resources/logback-spring.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-
-
-
- /var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log
-
- %d{dd-MM-yyyy HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M - %msg%n
-
-
- /var/log/dnet-bmuse-webapp/dnet-bmuse-webapp_%d{dd-MM-yyyy}_%i.log
- 10MB
- 10
- 100MB
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index ea4c6e21..e3d03af5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -278,6 +278,43 @@
+
+
+ hwu.elixir
+ bmuse-core
+ 0.5.4
+
+
+ org.apache.any23
+ apache-any23-core
+ 2.3
+
+
+ org.eclipse.rdf4j
+ rdf4j-rio-rdfxml
+ 3.7.1
+
+
+ org.eclipse.rdf4j
+ rdf4j-model
+ 3.7.1
+
+
+ org.jsoup
+ jsoup
+ 1.13.1
+
+
+ org.seleniumhq.selenium
+ selenium-java
+ 3.141.59
+
+
+ commons-validator
+ commons-validator
+ 1.6
+
+
@@ -418,5 +455,6 @@
1.71.0
false
1.3.6
+ 2.6