dnet-applications/apps/bioschemas-api/src/main/java/eu/dnetlib/bioschemas/api/scraper/BMUSEScraper.java


package eu.dnetlib.bioschemas.api.scraper;

import hwu.elixir.scrape.exceptions.MissingMarkupException;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringWriter;

/**
 * Scraper that fetches a web page, extracts the RDF markup embedded in its HTML and
 * returns it serialized as N-Quads.
 *
 * <p>Fetching, URL fixing, id injection and triple post-processing are delegated to helper
 * methods that are not defined in this class (presumably inherited from
 * {@link ScraperFilteredCore} - confirm against that class).
 */
public class BMUSEScraper extends ScraperFilteredCore {

	private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class);

	/**
	 * Scrapes the page at {@code url} and returns the extracted markup as an N-Quads document.
	 *
	 * @param url     address of the page to scrape; it is passed through {@code fixURL} before use
	 * @param dynamic {@code true} to scrape dynamically (selenium), {@code false} to scrape
	 *                statically (JSOUP); {@code null} is treated as {@code false} instead of
	 *                failing with a {@link NullPointerException} on unboxing
	 * @return the N-Quads serialization of the processed RDF model
	 * @throws MissingMarkupException if no triples could be extracted from the page
	 * @throws Exception              if the scraped HTML is empty ("empty html"), if the processed
	 *                                model is null ("rdf model null"), or if any delegated step fails
	 */
	public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
		// Logged before fixURL on purpose: the first message shows the URL exactly as received.
		logger.debug("{} > scraping", url);
		url = fixURL(url);

		// The dynamic flag determines if the scraper should use selenium or JSOUP to scrape the
		// information (dynamic and static respectively). Boolean.TRUE.equals() is null-safe.
		final String rawHtml = Boolean.TRUE.equals(dynamic)
				? wrapHTMLExtraction(url)
				: wrapHTMLExtractionStatic(url);

		if (rawHtml == null || rawHtml.isEmpty()) {
			throw new Exception("empty html");
		}

		final String html = injectId(rawHtml, url);

		logger.debug("{} > html scraped from {}", url, url);
		final DocumentSource source = new StringDocumentSource(html, url);
		final String n3 = html2Triples(source, url);
		if (n3 == null) {
			// html2Triples returns null when the extraction failed (the cause is already logged).
			throw new MissingMarkupException(url);
		}

		logger.debug("{} > processing triples", url);
		final IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
		final Model updatedModel = processTriples(n3, sourceIRI, 0L);
		if (updatedModel == null) {
			throw new Exception("rdf model null");
		}

		logger.debug("{} > generating nquads", url);
		try (StringWriter nquadsWriter = new StringWriter()) {
			Rio.write(updatedModel, nquadsWriter, RDFFormat.NQUADS);
			logger.debug("{} > nquads generated", url);
			return nquadsWriter.toString();
		}
	}

	/**
	 * Runs Any23 over {@code source} and returns the extracted triples in N-Triples syntax.
	 *
	 * <p>Extraction, I/O and triple-handler failures are logged and reported to the caller as
	 * a {@code null} result rather than as an exception.
	 *
	 * @param source the document to extract triples from
	 * @param url    address of the document; currently unused, kept for signature stability
	 * @return the N-Triples text decoded as UTF-8, or {@code null} if the extraction failed
	 * @throws Exception declared for signature stability; other failures raised by the
	 *                   extraction propagate unchanged
	 */
	private String html2Triples(DocumentSource source, String url) throws Exception {
		final Any23 runner = new Any23();
		try (ByteArrayOutputStream out = new ByteArrayOutputStream();
			TripleHandler handler = new NTriplesWriter(out)) {
			runner.extract(source, handler);
			return out.toString("UTF-8");
		} catch (ExtractionException e) {
			logger.error("Cannot extract triples", e);
		} catch (IOException e1) {
			logger.error(" IO error whilst extracting triples", e1);
		} catch (TripleHandlerException e2) {
			logger.error("TripleHanderException", e2);
		}
		return null;
	}
}