dnet-hadoop/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java


package eu.dnetlib.dhp.bmuse.utils;

import java.io.StringWriter;

import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;

/**
 * Scraper that fetches a web page, extracts the structured markup it embeds and returns it serialized as N-Quads.
 * <p>
 * The helper methods used here ({@code fixURL}, {@code wrapHTMLExtraction}, {@code wrapHTMLExtractionStatic},
 * {@code injectId}, {@code getTriplesInNTriples}, {@code processTriples}) are not defined in this class; they are
 * presumably inherited from {@link ScraperFilteredCore} - confirm against the BMUSE library version in use.
 */
public class BMUSEScraper extends ScraperFilteredCore {

	private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class);

	/**
	 * Scrapes the given URL and returns the extracted triples serialized in N-Quads format.
	 * <p>
	 * Note that most failure conditions are NOT signalled with an exception: the method returns a diagnostic string
	 * instead, so callers have to inspect the result. The possible non-RDF results are:
	 * <ul>
	 * <li>{@code "empty html"} when no HTML could be retrieved;</li>
	 * <li>{@code "rdf model null"} when the triples could not be turned into a model;</li>
	 * <li>the message of the exception raised while injecting the id, parsing the N-Triples or writing the
	 * output (may be {@code null} if the exception carries no message).</li>
	 * </ul>
	 *
	 * @param url the address of the page to scrape
	 * @param dynamic {@code true} to scrape with selenium (dynamic pages), {@code false} to scrape with JSOUP (static
	 * pages); {@code null} is treated as {@code false}
	 * @return the N-Quads serialization of the extracted model, or one of the diagnostic strings listed above
	 * @throws MissingMarkupException if no triples could be extracted from the page
	 * @throws FourZeroFourException declared by the scraping helpers (presumably raised when the page is not found)
	 */
	public String scrapeUrl(String url, Boolean dynamic)
		throws MissingMarkupException, FourZeroFourException {
		url = fixURL(url);

		// The dynamic flag determines if the scraper should use selenium or JSOUP to scrape the information
		// (dynamic and static respectively). Boolean.TRUE.equals avoids the NullPointerException that
		// auto-unboxing a null Boolean would raise.
		String html = Boolean.TRUE.equals(dynamic)
			? wrapHTMLExtraction(url)
			: wrapHTMLExtractionStatic(url);

		if (html == null || html.isEmpty()) {
			return "empty html";
		}
		if (logger.isTraceEnabled()) {
			logger.trace("Read following html ==============================================================");
			logger.trace(html);
		}

		try {
			html = injectId(html, url);
			if (logger.isTraceEnabled()) {
				logger
					.trace(
						"Same HTML after injecting ID ==============================================================");
				logger.trace(html);
			}
		} catch (MissingHTMLException | JsonLDInspectionException e) {
			// pass the exception itself so that the stack trace is not lost
			logger.error("Failed to inject id into the HTML of " + url, e);
			return e.getMessage();
		}

		// The whole page can be very large: keep it out of the INFO log and only build the message when needed.
		if (logger.isDebugEnabled()) {
			logger.debug("HTML: " + html);
		}

		DocumentSource source = new StringDocumentSource(html, url);
		IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());

		String n3 = getTriplesInNTriples(source);
		if (n3 == null) {
			throw new MissingMarkupException(url);
		}

		Model updatedModel;
		try {
			updatedModel = processTriples(n3, sourceIRI, 0L);
		} catch (NTriplesParsingException e) {
			logger
				.error(
					"Failed to process triples into model; the NTriples generated from the URL (" + url
						+ ") could not be parsed into a model.",
					e);
			return e.getMessage();
		}
		if (updatedModel == null) {
			return "rdf model null";
		}

		// The model is serialized as N-Quads (not JSON-LD).
		try (StringWriter nquadsWriter = new StringWriter()) {
			Rio.write(updatedModel, nquadsWriter, RDFFormat.NQUADS);
			return nquadsWriter.toString();
		} catch (Exception e) {
			logger.error("Problem writing N-Quads for " + url, e);
			return e.getMessage();
		}
	}
}