package eu.dnetlib.bioschemas.api.scraper; import hwu.elixir.scrape.exceptions.MissingMarkupException; import hwu.elixir.scrape.scraper.ScraperFilteredCore; import org.apache.any23.Any23; import org.apache.any23.extractor.ExtractionException; import org.apache.any23.source.DocumentSource; import org.apache.any23.source.StringDocumentSource; import org.apache.any23.writer.NTriplesWriter; import org.apache.any23.writer.TripleHandler; import org.apache.any23.writer.TripleHandlerException; import org.apache.commons.io.output.ByteArrayOutputStream; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Model; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.eclipse.rdf4j.rio.RDFFormat; import org.eclipse.rdf4j.rio.Rio; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.StringWriter; public class BMUSEScraper extends ScraperFilteredCore { private static Logger logger = LoggerFactory.getLogger(BMUSEScraper.class); public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception { logger.debug(url + " > scraping"); url = fixURL(url); String html = ""; // The dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information // (dynamic and static respectively) if (dynamic) { html = wrapHTMLExtraction(url); } else { html = wrapHTMLExtractionStatic(url); } if (html == null || html.contentEquals("")) throw new Exception("empty html"); html = injectId(html, url); logger.debug(url + " > html scraped from " + url); DocumentSource source = new StringDocumentSource(html, url); String n3 = html2Triples(source, url); if (n3 == null) { throw new MissingMarkupException(url); } logger.debug(url + " > processing triples"); IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI()); Model updatedModel = updatedModel = processTriples(n3, sourceIRI, 0l); if (updatedModel == null) { throw new Exception("rdf model null"); } logger.debug(url + " > generating nquads"); try (StringWriter jsonLDWriter = new StringWriter()) { Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS); logger.debug(url + " > nquads generated"); return jsonLDWriter.toString(); } catch (Exception e) { throw e; } } private String html2Triples(DocumentSource source, String url) throws Exception { Any23 runner = new Any23(); try (ByteArrayOutputStream out = new ByteArrayOutputStream(); TripleHandler handler = new NTriplesWriter(out);) { runner.extract(source, handler); return out.toString("UTF-8"); } catch (ExtractionException e) { logger.error("Cannot extract triples", e); } catch (IOException e1) { logger.error(" IO error whilst extracting triples", e1); } catch (TripleHandlerException e2) { logger.error("TripleHanderException", e2); } return null; } }