88 lines
2.9 KiB
Java
88 lines
2.9 KiB
Java
|
|
package eu.dnetlib.bioschemas.api.scraper;
|
|
|
|
import hwu.elixir.scrape.exceptions.MissingMarkupException;
|
|
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
|
|
import org.apache.any23.Any23;
|
|
import org.apache.any23.extractor.ExtractionException;
|
|
import org.apache.any23.source.DocumentSource;
|
|
import org.apache.any23.source.StringDocumentSource;
|
|
import org.apache.any23.writer.NTriplesWriter;
|
|
import org.apache.any23.writer.TripleHandler;
|
|
import org.apache.any23.writer.TripleHandlerException;
|
|
import org.apache.commons.io.output.ByteArrayOutputStream;
|
|
import org.eclipse.rdf4j.model.IRI;
|
|
import org.eclipse.rdf4j.model.Model;
|
|
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
|
|
import org.eclipse.rdf4j.rio.RDFFormat;
|
|
import org.eclipse.rdf4j.rio.Rio;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.io.IOException;
|
|
import java.io.StringWriter;
|
|
|
|
public class BMUSEScraper extends ScraperFilteredCore {
|
|
|
|
private static Logger logger = LoggerFactory.getLogger(BMUSEScraper.class);
|
|
|
|
public String getNQUADSFromUrl(String url, Boolean dynamic) throws Exception {
|
|
logger.debug(url + " > scraping");
|
|
url = fixURL(url);
|
|
|
|
String html = "";
|
|
// The dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information
|
|
// (dynamic and static respectively)
|
|
|
|
if (dynamic) {
|
|
html = wrapHTMLExtraction(url);
|
|
} else {
|
|
html = wrapHTMLExtractionStatic(url);
|
|
}
|
|
|
|
if (html == null || html.contentEquals(""))
|
|
throw new Exception("empty html");
|
|
|
|
html = injectId(html, url);
|
|
|
|
logger.debug(url + " > html scraped from " + url);
|
|
DocumentSource source = new StringDocumentSource(html, url);
|
|
String n3 = html2Triples(source, url);
|
|
if (n3 == null) {
|
|
throw new MissingMarkupException(url);
|
|
}
|
|
|
|
logger.debug(url + " > processing triples");
|
|
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
|
|
Model updatedModel = updatedModel = processTriples(n3, sourceIRI, 0l);
|
|
if (updatedModel == null) {
|
|
throw new Exception("rdf model null");
|
|
}
|
|
|
|
logger.debug(url + " > generating nquads");
|
|
try (StringWriter jsonLDWriter = new StringWriter()) {
|
|
Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS);
|
|
logger.debug(url + " > nquads generated");
|
|
return jsonLDWriter.toString();
|
|
} catch (Exception e) {
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
private String html2Triples(DocumentSource source, String url) throws Exception {
|
|
Any23 runner = new Any23();
|
|
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
|
|
TripleHandler handler = new NTriplesWriter(out);) {
|
|
runner.extract(source, handler);
|
|
return out.toString("UTF-8");
|
|
} catch (ExtractionException e) {
|
|
logger.error("Cannot extract triples", e);
|
|
} catch (IOException e1) {
|
|
logger.error(" IO error whilst extracting triples", e1);
|
|
} catch (TripleHandlerException e2) {
|
|
logger.error("TripleHanderException", e2);
|
|
}
|
|
return null;
|
|
}
|
|
}
|