diff --git a/apps/bioschemas-api/pom.xml b/apps/bioschemas-api/pom.xml index 4bcb0408..faa25156 100644 --- a/apps/bioschemas-api/pom.xml +++ b/apps/bioschemas-api/pom.xml @@ -12,62 +12,38 @@ bioschemas-api - - org.springframework.boot - spring-boot-starter-test - test - hwu.elixir bmuse-core - 0.5.4 - - org.freemarker - freemarker - 2.3.27-incubating - - org.apache.any23 apache-any23-core - 2.3 org.eclipse.rdf4j rdf4j-rio-rdfxml - 3.7.1 org.eclipse.rdf4j rdf4j-model - 3.7.1 - org.jsoup jsoup - 1.13.1 org.seleniumhq.selenium selenium-java - 3.141.59 commons-io commons-io - 2.6 + ${bioschemas-commons-io-version} commons-validator commons-validator - 1.6 - - - ch.qos.logback - logback-classic - 1.2.3 diff --git a/apps/bioschemas-api/pom.xml.original b/apps/bioschemas-api/pom.xml.original deleted file mode 100644 index f79644f5..00000000 --- a/apps/bioschemas-api/pom.xml.original +++ /dev/null @@ -1,173 +0,0 @@ - - - - org.springframework.boot - spring-boot-starter-parent - 2.1.3.RELEASE - - - 4.0.0 - eu.dnetlib - dnet-bmuse-webapp - jar - 1.0.0-SNAPSHOT - - scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet50/modules/dnet-bmuse-webapp/trunk - https://github.com/spring-projects/spring-boot/spring-boot-starter-parent/dnet-bmuse-webapp - - - jenkins - https://jenkins-dnet.d4science.org/view/DNet50/job/DSL50_dnet-bmuse-webapp/ - - - - dnet5-releases - D-Net 5 Releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases - default - - - - - - - - dnet-deps - D-Net Dependencies - https://maven.d4science.org/nexus/content/repositories/dnet-deps/ - - true - - - false - - default - - - dnet5-releases - D-Net 5 Releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-releases - default - - false - - - - dnet5-snapshots - D-Net 5 Snapshots - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet5-snapshots - default - - true - - - - - - - junit - junit - 4.13-rc-1 - test - - - 
org.springframework.boot - spring-boot-starter-test - test - - - org.springframework.boot - spring-boot-autoconfigure - - - org.springframework.boot - spring-boot - - - org.springframework.boot - spring-boot-starter-web - - - hwu.elixir - bmuse-core - 0.5.4 - - - org.freemarker - freemarker - 2.3.27-incubating - - - - org.apache.any23 - apache-any23-core - 2.3 - - - org.eclipse.rdf4j - rdf4j-rio-rdfxml - 3.7.1 - - - org.eclipse.rdf4j - rdf4j-model - 3.7.1 - - - - org.jsoup - jsoup - 1.13.1 - - - org.seleniumhq.selenium - selenium-java - 3.141.59 - - - commons-io - commons-io - 2.6 - - - commons-validator - commons-validator - 1.6 - - - ch.qos.logback - logback-classic - 1.2.3 - - - - - - - - org.springframework.boot - spring-boot-maven-plugin - - true - - - - - - - 1.8 - false - - - - - java8-doclint-disabled - - [1.8,) - - - -Xdoclint:none - - - - \ No newline at end of file diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java index 634f9172..529980e9 100644 --- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java +++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/AppConfigGarr.java @@ -1,13 +1,7 @@ package eu.dnetlib.bmuse_webapp; -import org.springframework.boot.web.client.RestTemplateBuilder; -import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Profile; -import org.springframework.web.client.RestTemplate; - -import freemarker.cache.ClassTemplateLoader; -import freemarker.template.TemplateExceptionHandler; /** * @author enrico.ottonello @@ -17,29 +11,4 @@ import freemarker.template.TemplateExceptionHandler; @Configuration public class AppConfigGarr { - @Bean - public RestTemplate jrrRestTemplate(){ - //TODO: move configuration here from CatalogueRegistrator? 
- return new RestTemplateBuilder().build(); - } - - - @Bean - public freemarker.template.Configuration freemarkerConfig(){ - freemarker.template.Configuration config = new freemarker.template.Configuration(freemarker.template.Configuration.VERSION_2_3_27); - ClassTemplateLoader ctl = new ClassTemplateLoader(getClass(), "/eu/dnetlib/bmuse_webapp/sparql"); - config.setTemplateLoader(ctl); - config.setDefaultEncoding("UTF-8"); - // Sets how errors will appear. - // During web page *development* TemplateExceptionHandler.HTML_DEBUG_HANDLER is better. - config.setTemplateExceptionHandler(TemplateExceptionHandler.RETHROW_HANDLER); - - // Don't log exceptions inside FreeMarker that it will thrown at you anyway: - config.setLogTemplateExceptions(false); - - // Wrap unchecked exceptions thrown during template processing into TemplateException-s. - config.setWrapUncheckedExceptions(true); - - return config; - } } \ No newline at end of file diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java index 3b1ab451..f87578c8 100644 --- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java +++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/ServiceScrapeDriver.java @@ -4,31 +4,24 @@ import eu.dnetlib.bmuse_webapp.crawl.CrawlRecord; import eu.dnetlib.bmuse_webapp.scraper.BMUSEScraper; import eu.dnetlib.bmuse_webapp.scraper.ScrapeState; import eu.dnetlib.bmuse_webapp.scraper.ScrapeThread; -import eu.dnetlib.bmuse_webapp.scraper.ServiceScraper; import eu.dnetlib.bmuse_webapp.utils.UrlParser; -import hwu.elixir.utils.Helpers; -import org.apache.commons.io.FileUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.*; -import 
java.nio.charset.Charset; import java.text.SimpleDateFormat; -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; +import java.util.Date; +import java.util.List; +import java.util.Objects; +import java.util.Properties; import java.util.stream.Collectors; import java.util.stream.Stream; /** * Runs the scrape. Collect a list of URLs (in the form of CrawlRecords) to scrape. - * Scrapes them in turn, writes the (bio)schema markup extracted to a file (1 file per URL) - * and adds provenance to the CrawlRecord. - * * */ public class ServiceScrapeDriver { @@ -96,31 +89,15 @@ public class ServiceScrapeDriver { logger.info("STARTING CRAWL: " + formatter.format(new Date(System.currentTimeMillis()))); while (pagesCounter < totalNumberOfPagesToCrawlInASession) { - logger.info(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession); + logger.debug(pagesCounter + " scraped of " + totalNumberOfPagesToCrawlInASession); ScrapeThread scrape1 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, scrapeVersion); scrape1.setName("S1"); - -// ScrapeThread scrape2 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder); -// scrape2.setName("S2"); -// -// ScrapeThread scrape3 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder); -// scrape3.setName("S3"); -// -// ScrapeThread scrape4 = new ScrapeThread(new BMUSEScraper(), scrapeState, waitTime, outputFolder); -// scrape4.setName("S4"); - scrape1.start(); -// scrape2.start(); -// scrape3.start(); -// scrape4.start(); long startTime = System.nanoTime(); try { scrape1.join(); -// scrape2.join(); -// scrape3.join(); -// scrape4.join(); } catch (InterruptedException e) { logger.error("Exception waiting on thread"); e.printStackTrace(); @@ -135,21 +112,13 @@ public class ServiceScrapeDriver { } logger.debug("Value of isFileWritten: " + scrape1.isFileWritten()); - long endTime = System.nanoTime(); long timeElapsed = endTime - startTime; - logger.info("Time in s to 
complete: " + timeElapsed / 1e+9); - - updateDatabase(scrapeState); + logger.debug("Time in s to complete: " + timeElapsed / 1e+9); pagesCounter += numberOfPagesToCrawlInALoop; - - - logger.info("ENDED loop"); + logger.debug("ENDED loop"); } -// Map nquads = scrapeState.getNquadsConcurrentHashMap(); -// logger.info("Available nquads records: "+nquads.size() ); - logger.info("ENDING CRAWL: " + formatter.format(new Date(System.currentTimeMillis()))); File output = new File(outputFolder.concat("/").concat(outputFilename)); @@ -173,20 +142,7 @@ public class ServiceScrapeDriver { } } bufferedWriter.close(); - logger.info(" dump to "+output.getAbsolutePath()); - } - - /** - * - * @param scrapeState State of scrape at end - * @return true if success / false otherwise - * @see ScrapeState - * @see CrawlRecord - */ - private boolean updateDatabase(ScrapeState scrapeState) { - boolean result = false; - - return result; + logger.info(" Data stored into "+output.getAbsolutePath()); } /** @@ -256,6 +212,4 @@ public class ServiceScrapeDriver { String[] parts = pageUrl.split("/"); return parts[parts.length - 1]; } - - } diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java index 22beeb97..2339a8b3 100644 --- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java +++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/publisher/BMUSEWebappController.java @@ -3,21 +3,19 @@ package eu.dnetlib.bmuse_webapp.publisher; import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver; import eu.dnetlib.common.controller.AbstractDnetController; import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; import org.apache.commons.io.LineIterator; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.tomcat.jni.FileInfo; -import 
org.springframework.beans.factory.annotation.Autowired; -import org.springframework.web.bind.annotation.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestMethod; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; import javax.servlet.http.HttpServletResponse; import java.io.File; -import java.io.FileInputStream; -import java.io.FileReader; import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.util.List; /** * @author enrico.ottonello @@ -28,30 +26,31 @@ import java.util.List; @RequestMapping("/api") public class BMUSEWebappController extends AbstractDnetController { - private static final Log log = LogFactory.getLog(BMUSEWebappController.class); + @Value("${outputFolder}") + private String outputFolder; + @Value("${outputDataPattern}") + private String outputDataPattern; - @RequestMapping(value = "/version", method = RequestMethod.GET) - public String version() throws BMUSEWebappException { - return "1.0.0-SNAPSHOT"; - } + private static final Logger logger = LoggerFactory.getLogger(BMUSEWebappController.class); - @RequestMapping(value = "/scrape", method = RequestMethod.GET) - public String scrape(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException { + @RequestMapping(value = "/startScraping", method = RequestMethod.GET) + public String startScraping(@RequestParam final String datasourceKey, @RequestParam final String sitemapUrl) throws BMUSEWebappException, IOException { + + logger.info(" datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl); - log.info("datasourceKey: "+datasourceKey+" sitemapUrl:"+sitemapUrl); -// String sitemapUrl = "https://mobidb.org/sitemap2.xml.gz"; 
scrape?datasourceKey=mobidb&sitemapUrl=https%3A%2F%2Fmobidb.org%2Fsitemap2.xml.gz -// String sitemapUrl = "https://proteinensemble.org/sitemap2.xml.gz"; scrape?datasourceKey=ped&sitemapUrl=https%3A%2F%2Fproteinensemble.org%2Fsitemap2.xml.gz -// String sitemapUrl = "https://disprot.org/sitemap2.xml.gz"; scrape?datasourceKey=disprot&sitemapUrl=https%3A%2F%2Fdisprot.org%2Fsitemap2.xml.gz String sitemapUrlKey = "loc"; - String outputFilename = datasourceKey.concat("_base64_gzipped_nquads.txt"); + String outputFilename = datasourceKey.concat(getOutputDataPattern()); ServiceScrapeDriver service = new ServiceScrapeDriver(sitemapUrl, sitemapUrlKey, null, outputFilename); service.start(); return "started"; } - @RequestMapping(value = "/nquads", method = RequestMethod.GET) - public String nquads(HttpServletResponse response) throws BMUSEWebappException, IOException { - LineIterator it = FileUtils.lineIterator(new File("/Users/enrico.ottonello/data/bmuse-output/output.nq"), "UTF-8"); + @RequestMapping(value = "/getNQuads", method = RequestMethod.GET) + public String getNQuads(@RequestParam final String datasourceKey, HttpServletResponse response) throws BMUSEWebappException, IOException { + + logger.info(" datasourceKey: "+datasourceKey); + + LineIterator it = FileUtils.lineIterator(new File(getOutputFolder().concat("/").concat(datasourceKey).concat(getOutputDataPattern())), "UTF-8"); try { while (it.hasNext()) { String line = it.nextLine(); @@ -62,4 +61,12 @@ public class BMUSEWebappController extends AbstractDnetController { } return ""; } + + public String getOutputFolder() { + return outputFolder; + } + + public String getOutputDataPattern() { + return outputDataPattern; + } } \ No newline at end of file diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java index 4203f18a..763c9f64 100644 --- 
a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java +++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/BMUSEScraper.java @@ -1,7 +1,6 @@ package eu.dnetlib.bmuse_webapp.scraper; -import eu.dnetlib.bmuse_webapp.ServiceScrapeDriver; import hwu.elixir.scrape.exceptions.MissingMarkupException; import hwu.elixir.scrape.scraper.ScraperFilteredCore; import org.apache.any23.Any23; @@ -12,8 +11,8 @@ import org.apache.any23.writer.NTriplesWriter; import org.apache.any23.writer.TripleHandler; import org.apache.any23.writer.TripleHandlerException; import org.apache.commons.io.output.ByteArrayOutputStream; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Model; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; @@ -27,7 +26,7 @@ import java.io.StringWriter; public class BMUSEScraper extends ScraperFilteredCore { - private static final Log logger = LogFactory.getLog(BMUSEScraper.class); + private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class); public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception { logger.debug(url + " > scraping"); diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java index 7fe1ef88..27847ad3 100644 --- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java +++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ScrapeThread.java @@ -6,13 +6,8 @@ import hwu.elixir.scrape.exceptions.CannotWriteException; import hwu.elixir.scrape.exceptions.FourZeroFourException; import hwu.elixir.scrape.exceptions.JsonLDInspectionException; import hwu.elixir.scrape.exceptions.MissingMarkupException; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -//import 
org.apache.commons.logging.Log; -//import org.apache.commons.logging.LogFactory; -//import org.slf4j.Logger; -//import org.slf4j.LoggerFactory; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.Date; /** @@ -28,7 +23,7 @@ public class ScrapeThread extends Thread { private boolean fileWritten = true; private int scrapeVersion = 1; - private static final Log logger = LogFactory.getLog(ScrapeThread.class); + private static final Logger logger = LoggerFactory.getLogger(ScrapeThread.class); /** * Sets up a thread for actually scrapping. @@ -68,8 +63,7 @@ public class ScrapeThread extends Thread { try { String nquads = process.getNQUADSFromUrl(record.getUrl(), true); -// scrapeState.addNquads(record.getName(), nquads); - logger.info("downloaded "+record.getUrl()); + logger.info("downloaded "+record.getUrl() + " leftToScrape:" + scrapeState.pagesLeftToScrape()); record.setNquads(CompressorUtil.compressValue(nquads)); if (!nquads.isEmpty()) { scrapeState.addSuccessfulScrapedURL(record); diff --git a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java index b13e9cb6..fbcc0483 100644 --- a/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java +++ b/apps/bioschemas-api/src/main/java/eu/dnetlib/bmuse_webapp/scraper/ServiceScraper.java @@ -3,7 +3,6 @@ package eu.dnetlib.bmuse_webapp.scraper; import eu.dnetlib.bmuse_webapp.crawl.StatusOfScrape; import hwu.elixir.scrape.exceptions.*; import hwu.elixir.scrape.scraper.ScraperFilteredCore; -import org.apache.commons.lang.time.DateUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,7 +19,7 @@ import org.slf4j.LoggerFactory; */ public class ServiceScraper extends ScraperFilteredCore { - private static Logger logger = LoggerFactory.getLogger(System.class.getName()); + private static final Logger logger = LoggerFactory.getLogger(ServiceScraper.class); 
private StatusOfScrape status= null; diff --git a/apps/bioschemas-api/src/main/resources/application.properties b/apps/bioschemas-api/src/main/resources/application.properties index ce5f349b..7bec4ba6 100644 --- a/apps/bioschemas-api/src/main/resources/application.properties +++ b/apps/bioschemas-api/src/main/resources/application.properties @@ -3,7 +3,7 @@ server.port=8281 spring.profiles.active=garr -logging.file.name = /var/log/springboot/9480/oa_organizations.log +logging.file.name = /var/log/bioschemas/log/bioschemas-api.log maven.pom.path = /META-INF/maven/eu.dnetlib.dhp/bioschemas-api/effective-pom.xml @@ -17,7 +17,8 @@ management.endpoints.web.path-mapping.prometheus = metrics management.endpoints.web.path-mapping.health = health waitTime=5 -outputFolder=/Users/enrico.ottonello/data/bmuse-output +outputFolder=/data +outputDataPattern=_base64_gzipped_nquads.txt numberOfPagesToCrawlInALoop=8 totalNumberOfPagesToCrawlInASession=32 chromiumDriverLocation = /usr/local/bin/chromedriver diff --git a/apps/bioschemas-api/src/main/resources/logback-spring.xml b/apps/bioschemas-api/src/main/resources/logback-spring.xml deleted file mode 100644 index 3c5e86fe..00000000 --- a/apps/bioschemas-api/src/main/resources/logback-spring.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - /var/log/dnet-bmuse-webapp/dnet-bmuse-webapp.log - - %d{dd-MM-yyyy HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M - %msg%n - - - /var/log/dnet-bmuse-webapp/dnet-bmuse-webapp_%d{dd-MM-yyyy}_%i.log - 10MB - 10 - 100MB - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/pom.xml b/pom.xml index ea4c6e21..e3d03af5 100644 --- a/pom.xml +++ b/pom.xml @@ -278,6 +278,43 @@ + + + hwu.elixir + bmuse-core + 0.5.4 + + + org.apache.any23 + apache-any23-core + 2.3 + + + org.eclipse.rdf4j + rdf4j-rio-rdfxml + 3.7.1 + + + org.eclipse.rdf4j + rdf4j-model + 3.7.1 + + + org.jsoup + jsoup + 1.13.1 + + + org.seleniumhq.selenium + selenium-java + 3.141.59 + + + commons-validator + commons-validator + 
1.6 + + @@ -418,5 +455,6 @@ 1.71.0 false 1.3.6 + 2.6