// dnet-hadoop/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java
//
// 86 lines
// 2.6 KiB
// Java

package eu.dnetlib.dhp.bmuse.utils;
import java.io.StringWriter;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
public class BMUSEScraper extends ScraperFilteredCore {
private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName());
public String scrapeUrl(String url, Boolean dynamic)
throws MissingMarkupException, FourZeroFourException {
url = fixURL(url);
String html = "";
// The dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information
// (dynamic and static respectively)
if (dynamic) {
html = wrapHTMLExtraction(url);
} else {
html = wrapHTMLExtractionStatic(url);
}
if (html == null || html.contentEquals(""))
return new String("empty html");
if (logger.isTraceEnabled()) {
logger.trace("Read following html ==============================================================");
logger.trace(html);
}
try {
html = injectId(html, url);
if (logger.isTraceEnabled()) {
logger
.trace(
"Same HTML after injecting ID ==============================================================");
logger.trace(html);
}
} catch (MissingHTMLException | JsonLDInspectionException e) {
logger.error(e.toString());
return e.getMessage();
}
logger.info("HTML: " + html);
DocumentSource source = new StringDocumentSource(html, url);
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
String n3 = getTriplesInNTriples(source);
if (n3 == null)
throw new MissingMarkupException(url);
Model updatedModel = null;
try {
updatedModel = processTriples(n3, sourceIRI, 0l);
} catch (NTriplesParsingException e1) {
logger
.error(
"Failed to process triples into model; the NTriples generated from the URL (" + url
+ ") could not be parsed into a model.");
return e1.getMessage();
}
if (updatedModel == null)
return new String("rdf model null");
try (StringWriter jsonLDWriter = new StringWriter()) {
Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS);
return jsonLDWriter.toString();
} catch (Exception e) {
logger.error("Problem writing jsonld for " + url, e);
return e.getMessage();
}
}
}