updated dhp-rdfconverter version to 1.2.5-SNAPSHOT

This commit is contained in:
Enrico Ottonello 2022-05-11 11:20:16 +02:00
parent 6fa9624c29
commit baa312f256
16 changed files with 1 additions and 823 deletions

View File

@ -1,96 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- rdf 2.5.4 to 3.7.1-->

View File

@ -1,62 +0,0 @@
<description>the working path</description>
<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
<description>the working path</description>
<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
<description>the working path</description>
<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>

View File

@ -1,113 +0,0 @@
package eu.dnetlib.dhp.bmuse.bioschema;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.bmuse.utils.ArgumentApplicationParser;
import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper;
import eu.dnetlib.dhp.bmuse.utils.UrlParser;
/**
 * Command-line job that scrapes the pages listed in a sitemap and appends the result of each page to a
 * Hadoop SequenceFile.
 * <p>
 * NOTE(review): this listing is an excerpt; several statements and closing braces are not visible, so the
 * comments below describe only the lines that are shown.
 */
public class ScrapingJob {
static Logger logger = LoggerFactory.getLogger(ScrapingJob.class);
/**
 * Entry point: reads the job parameters, fetches the sitemap entries and, for each entry, appends a
 * (page URL text, scraped n-quads) record to a block-compressed (gzip) SequenceFile.
 *
 * @param args command-line arguments; presumably handed to the ArgumentApplicationParser (the parsing
 *             call itself is not visible here, TODO confirm)
 * @throws Exception declared by the signature; failures of a single page are caught and logged inside the loop
 */
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
final String nameNode = parser.get("nameNode");
final String workingPath = parser.get("workingPath");
final String rdfOutput = parser.get("rdfOutput");
final String sitemapUrl = parser.get("sitemapUrl");
final String sitemapURLKey = parser.get("sitemapURLKey");
final String dynamic = parser.get("dynamic");
final String maxScrapedPages = parser.get("maxScrapedPages");
// "dynamic" is optional: when it is absent the job keeps the default (true).
Boolean dynamicValue = true;
if (Objects.nonNull(dynamic)) {
dynamicValue = Boolean.parseBoolean(dynamic);
final boolean scrapingType = dynamicValue.booleanValue();
"*************************** STARTING_SCRAPE");
BMUSEScraper scraper = new BMUSEScraper();
// NOTE(review): the whole URL is lower-cased, including its path; confirm that the target servers
// do not serve case-sensitive sitemap paths.
String url = sitemapUrl.toLowerCase();
Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);
Path output = new Path(
Configuration conf = getHadoopConfiguration(nameNode);
try (SequenceFile.Writer writer = SequenceFile
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
// "maxScrapedPages" is optional: when present only the first N sitemap entries are kept.
Stream<Element> urlStream = null;
if (Objects.nonNull(maxScrapedPages)) {
urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
} else {
urlStream = urls.stream();
List<Element> sites = urlStream.collect(Collectors.toList());
logger.info("Pages available for scraping: " + sites.size());
sites.forEach(u -> {
// Record key: the text content of the sitemap element (the page URL).
final Text key = new Text(u.text());
String nquads;
try {
String site = u.text();
logger.debug(site + " > parsing");
nquads = scraper.scrapeUrl(site, scrapingType);
final Text value = new Text(nquads);
writer.append(key, value);
// Any failure for this page is logged with the page URL; no rethrow is visible in this excerpt.
} catch (Throwable t) {
logger.error(u.text() + " -> ", t);
"*************************** ENDING_SCRAPE: ");
/**
 * Builds the Hadoop configuration used to write the output file.
 *
 * @param nameNode value stored under "fs.defaultFS"
 * @return a new Configuration with the default file system and the hdfs/file implementations set
 */
public static Configuration getHadoopConfiguration(String nameNode) {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", nameNode);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
// Side effect: sets a JVM-wide system property, not only a value on the returned configuration.
System.setProperty("hadoop.home.dir", "/");
return conf;

View File

@ -1,94 +0,0 @@
package eu.dnetlib.dhp.bmuse.utils;
import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.*;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
/**
 * Parses command-line arguments according to a list of OptionsParameter definitions and exposes the parsed
 * values as a String-to-String map. Values of parameters flagged as compressed are stored decompressed.
 * <p>
 * NOTE(review): this listing is an excerpt; several statements and closing braces are not visible, so the
 * comments below describe only the lines that are shown.
 */
public class ArgumentApplicationParser implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);
// commons-cli option definitions.
private final Options options = new Options();
// Parsed argument values, filled by parseArgument.
private final Map<String, String> objectMap = new HashMap<>();
// presumably the names of the options whose values are gzip+Base64 encoded; the line that fills it is not visible
private final List<String> compressedValues = new ArrayList<>();
/**
 * Creates a parser from a JSON document describing the accepted parameters.
 *
 * @param json_configuration JSON array that Jackson deserializes into OptionsParameter[]
 * @throws IOException declared by the signature (Jackson's readValue is the visible call that can raise it)
 */
public ArgumentApplicationParser(final String json_configuration) throws IOException {
final ObjectMapper mapper = new ObjectMapper();
final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
/**
 * Creates a parser from already-deserialized parameter definitions.
 *
 * @param configuration the accepted parameters
 */
public ArgumentApplicationParser(final OptionsParameter[] configuration) {
/**
 * Registers one commons-cli Option per parameter definition.
 *
 * @param configuration the accepted parameters
 */
private void createOptionMap(final OptionsParameter[] configuration) {
conf -> {
// The option is created from the short name and the description; "true" means it takes a value.
final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
if (conf.isCompressed()) {
return o;
/**
 * Decodes a Base64 string, gunzips the decoded bytes and returns the resulting text.
 * <p>
 * NOTE(review): neither getBytes() nor IOUtils.copy is given a charset here, and the GZIPInputStream is
 * not closed in the visible lines.
 *
 * @param abstractCompressed Base64 text of gzip-compressed data
 * @return the decompressed text
 * @throws IllegalArgumentException wrapping the IOException when the value cannot be decompressed
 */
public static String decompressValue(final String abstractCompressed) {
try {
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
final StringWriter stringWriter = new StringWriter();
IOUtils.copy(gis, stringWriter);
return stringWriter.toString();
} catch (IOException e) {
log.error("Wrong value to decompress: {}", abstractCompressed);
throw new IllegalArgumentException(e);
/**
 * Returns the Base64 encoding of the bytes collected in the gzip output buffer.
 * <p>
 * NOTE(review): the lines that write {@code value} into the gzip stream and close it are not visible in this
 * excerpt. This method encodes with java.util.Base64 while decompressValue decodes with commons-codec Base64.
 *
 * @param value the text to compress
 * @return Base64 text of the buffered bytes
 * @throws IOException declared by the signature
 */
public static String compressArgument(final String value) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out);
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
/**
 * Parses the command line against the registered options and stores every value in the object map.
 *
 * @param args the raw command-line arguments
 * @throws ParseException declared by the signature; CommandLineParser.parse is the visible call that can raise it
 */
public void parseArgument(final String[] args) throws ParseException {
CommandLineParser parser = new BasicParser();
CommandLine cmd = parser.parse(options, args);
// Each value is stored either decompressed or as-is; the condition that chooses between the two is
// not visible here (presumably a lookup in compressedValues, TODO confirm).
it -> objectMap
? decompressValue(it.getValue())
: it.getValue()));
/**
 * Looks up a parsed argument.
 *
 * @param key the name under which the value was stored by parseArgument
 * @return the stored value, or null when the map holds no entry for the key
 */
public String get(final String key) {
return objectMap.get(key);
/**
 * @return the internal map of parsed arguments (the map itself, not a copy)
 */
public Map<String, String> getObjectMap() {
return objectMap;

View File

@ -1,91 +0,0 @@
package eu.dnetlib.dhp.bmuse.utils;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
/**
 * Scraper that downloads a page, extracts its embedded markup as triples and serializes the processed
 * model as N-Quads.
 * <p>
 * NOTE(review): this listing is an excerpt; several closing braces are not visible. fixURL,
 * wrapHTMLExtraction, wrapHTMLExtractionStatic, injectId and processTriples are not defined in this class
 * and are presumably inherited from ScraperFilteredCore.
 */
public class BMUSEScraper extends ScraperFilteredCore {
private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName());
/**
 * Scrapes a single page and returns its content as an N-Quads string.
 *
 * @param url the page to scrape
 * @param dynamic selects wrapHTMLExtraction (true) or wrapHTMLExtractionStatic (false); it is a boxed
 *                Boolean that is unboxed in the condition, so a null value fails with a NullPointerException
 * @return the N-Quads serialization of the processed model
 * @throws Exception when the HTML is null or empty ("empty html"), when no triples could be extracted
 *                   (MissingMarkupException), when the processed model is null ("rdf model null"), or when
 *                   one of the called steps fails
 */
public String scrapeUrl(String url, Boolean dynamic) throws Exception {
logger.debug(url + " > scraping");
url = fixURL(url);
String html = "";
// The dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information
// (dynamic and static respectively)
if (dynamic) {
html = wrapHTMLExtraction(url);
} else {
html = wrapHTMLExtractionStatic(url);
if (html == null || html.contentEquals(""))
throw new Exception("empty html");
html = injectId(html, url);
logger.debug(url + " > html scraped from " + url);
DocumentSource source = new StringDocumentSource(html, url);
String n3 = html2Triples(source, url);
// html2Triples signals an extraction failure with null.
if (n3 == null) {
throw new MissingMarkupException(url);
logger.debug(url + " > processing triples");
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
// NOTE(review): the variable is assigned twice in the same statement; the second assignment is redundant.
Model updatedModel = updatedModel = processTriples(n3, sourceIRI, 0l);
if (updatedModel == null) {
throw new Exception("rdf model null");
logger.debug(url + " > generating nquads");
// Despite its name, this writer receives N-Quads (RDFFormat.NQUADS), not JSON-LD.
try (StringWriter jsonLDWriter = new StringWriter()) {
Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS);
logger.debug(url + " > nquads generated");
return jsonLDWriter.toString();
// The catch only rethrows the same exception.
} catch (Exception e) {
throw e;
/**
 * Runs the Any23 extraction on the given document and returns the triples written by an NTriplesWriter.
 *
 * @param source the HTML document to extract from
 * @param url the page URL (not used in the visible lines)
 * @return the extracted triples decoded as UTF-8, or null (extraction, I/O and triple-handler errors are
 *         logged rather than rethrown in the visible lines)
 * @throws Exception declared by the signature
 */
private String html2Triples(DocumentSource source, String url) throws Exception {
Any23 runner = new Any23();
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
TripleHandler handler = new NTriplesWriter(out);) {
runner.extract(source, handler);
return out.toString("UTF-8");
} catch (ExtractionException e) {
logger.error("Cannot extract triples", e);
} catch (IOException e1) {
logger.error(" IO error whilst extracting triples", e1);
} catch (TripleHandlerException e2) {
logger.error("TripleHanderException", e2);
return null;

View File

@ -1,35 +0,0 @@
package eu.dnetlib.dhp.bmuse.utils;
/**
 * Describes one accepted command-line parameter: short name, long name, description, whether it is
 * required and whether its value is passed compressed.
 * <p>
 * NOTE(review): this listing is an excerpt; closing braces are not visible and setCompressed is the only
 * setter shown.
 */
public class OptionsParameter {
// Short option name.
private String paramName;
// Long option name.
private String paramLongName;
// Human-readable description of the parameter.
private String paramDescription;
// Whether the parameter is mandatory.
private boolean paramRequired;
// Whether the value is passed compressed.
private boolean compressed;
/** @return the short option name */
public String getParamName() {
return paramName;
/** @return the long option name */
public String getParamLongName() {
return paramLongName;
/** @return the parameter description */
public String getParamDescription() {
return paramDescription;
/** @return true when the parameter is required */
public boolean isParamRequired() {
return paramRequired;
/** @return true when the value is passed compressed */
public boolean isCompressed() {
return compressed;
/** @param compressed whether the value is passed compressed */
public void setCompressed(boolean compressed) {
this.compressed = compressed;

View File

@ -1,65 +0,0 @@
package eu.dnetlib.dhp.bmuse.utils;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import hwu.elixir.utils.Helpers;
/**
 * Downloads a sitemap and returns the elements selected by a given key.
 * <p>
 * NOTE(review): this listing is an excerpt; several statements and closing braces are not visible, so the
 * comments below describe only the lines that are shown.
 */
public class UrlParser {
private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());
/**
 * Fetches the sitemap at the given URL and selects the elements matching sitemapURLKey.
 *
 * @param url the sitemap location; a URL ending in ".gz" (case-insensitive) is downloaded as bytes and decompressed
 * @param sitemapURLKey the selector passed to Document.select to pick the entries
 * @return the selected elements
 * @throws IOException declared by the signature; the IOException raised while fetching is caught and
 *                     logged in the visible lines
 */
public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
// Starts from a placeholder document that is replaced by the downloaded one below.
Document doc = new Document(url);
Document urlSitemapListsNested;
Elements elements = new Elements();
Elements sitemaps = new Elements();
boolean sitemapindex = false;
boolean urlset = false;
try {
int urlLength = url.length();
logger.info("parse sitemap list");
// NOTE(review): substring fails with StringIndexOutOfBoundsException for a URL shorter than three
// characters, and the visible catch handles IOException only.
String sitemapExt = url.substring(urlLength - 3, urlLength);
if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
logger.info("compressed sitemap");
byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
doc = Helpers.gzipFileDecompression(bytes);
} else {
// maxBodySize(0): per the jsoup documentation, zero means no limit on the response body size.
doc = Jsoup.connect(url).maxBodySize(0).get();
} catch (IOException e) {
logger.error("Jsoup parsing exception: " + e.getMessage());
try {
elements = doc.select(sitemapURLKey);
// check the html if it is a sitemapindex or a urlset
sitemapindex = doc.outerHtml().contains("sitemapindex");
urlset = doc.outerHtml().contains("urlset");
} catch (NullPointerException e) {
if (sitemapindex) {
// if sitemapindex get the loc of all the sitemaps
// added warning for sitemap index files
"please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
// NOTE(review): "sitemaps" is assigned here, but the value returned below is "elements".
sitemaps = doc.select(sitemapURLKey);
return elements;

View File

@ -1,44 +0,0 @@
"paramName": "n",
"paramLongName": "nameNode",
"paramDescription": "the Name Node URI",
"paramRequired": true
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "the working path",
"paramRequired": true
"paramName": "r",
"paramLongName": "rdfOutput",
"paramDescription": "rdf output of scraping step",
"paramRequired": true
"paramName": "u",
"paramLongName": "sitemapUrl",
"paramDescription": "the sitemap url",
"paramRequired": true
"paramName": "k",
"paramLongName": "sitemapURLKey",
"paramDescription": "the sitemap file contains a list of xml entries, each one has a tag identified with sitemapURLKey with the url as value",
"paramRequired": true
"paramName": "d",
"paramLongName": "dynamic",
"paramDescription": "the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)",
"paramRequired": false
"paramName": "m",
"paramLongName": "maxScrapedPages",
"paramDescription": "max number of pages that will be scraped, default: no limit",
"paramRequired": false

View File

@ -1,22 +0,0 @@

View File

@ -1,81 +0,0 @@
<workflow-app name="BioSchemaHarvester" xmlns="uri:oozie:workflow:0.5">
<description>the working path</description>
<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
<description>max number of pages that will be scraped, default: no limit</description>
<description>rdf output of scraping step</description>
<value>-Xmx4g -Dwebdriver.chrome.whitelistedIps=</value>
<description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
<action name="ResetWorkingPath">
<delete path='${workingPath}${rdfOutput}'/>
<ok to="bmuseScraping"/>
<error to="Kill"/>
<action name="bmuseScraping">
<ok to="End"/>
<error to="Kill"/>
<end name="End"/>

View File

@ -1,4 +0,0 @@

View File

@ -1,9 +0,0 @@
# Set root logger level to INFO and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
# A1 uses PatternLayout.
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

View File

@ -1,45 +0,0 @@
package eu.dnetlib.dhp.bmuse.bioschema;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Runs the Any23 triple extraction on an HTML page bundled as a test resource.
 * <p>
 * NOTE(review): this listing is an excerpt; no assertion is visible, and extraction errors are only
 * logged, so in the visible lines a failure does not fail the test.
 */
public class Html2TriplesTest {
static Logger logger = LoggerFactory.getLogger(Html2TriplesTest.class);
// @Disabled
/**
 * Loads ped.html from the classpath, wraps it as a document whose IRI is the PED00001 page and extracts
 * its triples with an NTriplesWriter.
 *
 * @throws Exception declared by the signature
 */
void conversionTest() throws Exception {
InputStream is = Html2TriplesTest.class.getResourceAsStream("/eu/dnetlib/dhp/bmuse/bioschema/ped.html");
String page = IOUtils.toString(is, StandardCharsets.UTF_8.name());
DocumentSource source = new StringDocumentSource(page, "https://proteinensemble.org/PED00001");
Any23 runner = new Any23();
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
TripleHandler handler = new NTriplesWriter(out);) {
runner.extract(source, handler);
} catch (ExtractionException e) {
logger.error("Cannot extract triples", e);
} catch (IOException e1) {
logger.error(" IO error whilst extracting triples", e1);
} catch (TripleHandlerException e2) {
logger.error("TripleHanderException", e2);

View File

@ -1,24 +0,0 @@
package eu.dnetlib.dhp.bmuse.bioschema;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.bmuse.utils.UrlParser;
/**
 * Exercises UrlParser.getSitemapList against a gzip-compressed sitemap.
 * <p>
 * NOTE(review): this listing is an excerpt; the body of the loop and the closing braces are not visible.
 */
public class SitemapTest {
static Logger logger = LoggerFactory.getLogger(SitemapTest.class);
/**
 * Fetches the "loc" entries of a remote ".gz" sitemap; the URL is a live address, so the test needs
 * network access.
 *
 * @throws Exception declared by the signature
 */
void sitemapGzTest() throws Exception {
Elements urls = UrlParser.getSitemapList("https://disprot.org/sitemap2.xml.gz", "loc");
urls.forEach(url -> {

File diff suppressed because one or more lines are too long

View File

@ -4,7 +4,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<version>1.2.4-SNAPSHOT</version> <version>1.2.5-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-rdfconverter</artifactId> <artifactId>dhp-rdfconverter</artifactId>