forked from D-Net/dnet-hadoop
updated dhp-rdfconverter version to 1.2.5-SNAPSHOT
This commit is contained in:
parent 6fa9624c29
commit baa312f256
@@ -1,96 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<parent>
		<groupId>eu.dnetlib.dhp</groupId>
		<artifactId>dhp-workflows</artifactId>
		<version>1.2.4-SNAPSHOT</version>
	</parent>
	<artifactId>dhp-bmuse</artifactId>

	<dependencies>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-core_2.11</artifactId>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-sql_2.11</artifactId>
		</dependency>
		<dependency>
			<groupId>hwu.elixir</groupId>
			<artifactId>bmuse-core</artifactId>
			<version>0.5.4</version>
		</dependency>

		<dependency>
			<groupId>org.apache.any23</groupId>
			<artifactId>apache-any23-core</artifactId>
			<version>2.3</version>
		</dependency>
		<dependency>
			<groupId>org.eclipse.rdf4j</groupId>
			<artifactId>rdf4j-rio-rdfxml</artifactId>
			<version>3.7.1</version>
		</dependency>
		<dependency>
			<groupId>org.eclipse.rdf4j</groupId>
			<artifactId>rdf4j-model</artifactId>
			<version>3.7.1</version>
		</dependency>
		<!-- rdf 2.5.4 to 3.7.1 -->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.13.1</version>
		</dependency>
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-java</artifactId>
			<version>3.141.59</version>
		</dependency>
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>2.6</version>
		</dependency>
		<dependency>
			<groupId>commons-validator</groupId>
			<artifactId>commons-validator</artifactId>
			<version>1.6</version>
		</dependency>

		<dependency>
			<groupId>com.google.guava</groupId>
			<artifactId>guava</artifactId>
			<version>22.0</version>
		</dependency>
		<dependency>
			<groupId>com.squareup.okhttp3</groupId>
			<artifactId>okhttp</artifactId>
			<version>3.11.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-compress</artifactId>
			<version>1.18</version>
		</dependency>
		<dependency>
			<groupId>com.fasterxml.jackson.core</groupId>
			<artifactId>jackson-core</artifactId>
			<version>2.9.6</version>
		</dependency>

		<dependency>
			<groupId>com.fasterxml.jackson.core</groupId>
			<artifactId>jackson-annotations</artifactId>
			<version>2.9.6</version>
		</dependency>
		<dependency>
			<groupId>com.fasterxml.jackson.core</groupId>
			<artifactId>jackson-databind</artifactId>
			<version>2.9.6</version>
		</dependency>
	</dependencies>

</project>
@@ -1,62 +0,0 @@
https://grafana.d4science.org/d/xfpJB9FGz-pa1/1-node-exporter-garr-pa1?orgId=1&var-origin_prometheus=&var-job=node&var-hostname=hadoop-worker8.garr-pa1.d4science.org&var-node=hadoop-worker-8&var-device=All&var-interval=2m&var-maxmount=%2Fhadoop&var-show_hostname=hadoop-worker8.garr-pa1.d4science.org&var-total=49&from=1638522510612&to=1638526110612

PED
<property>
	<name>workingPath</name>
	<value>/data/bioschema/ped/</value>
	<description>the working path</description>
</property>
<property>
	<name>sitemapUrl</name>
	<value>https://proteinensemble.org/sitemap2.xml.gz</value>
</property>
<property>
	<name>sitemapURLKey</name>
	<value>loc</value>
</property>
<property>
	<name>dynamic</name>
	<value>true</value>
	<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
</property>

DISPROT
<property>
	<name>workingPath</name>
	<value>/data/bioschema/disprot/</value>
	<description>the working path</description>
</property>
<property>
	<name>sitemapUrl</name>
	<value>https://disprot.org/sitemap2.xml.gz</value>
</property>
<property>
	<name>sitemapURLKey</name>
	<value>loc</value>
</property>
<property>
	<name>dynamic</name>
	<value>true</value>
	<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
</property>

MOBIDB
<property>
	<name>workingPath</name>
	<value>/data/bioschema/mobidb/</value>
	<description>the working path</description>
</property>
<property>
	<name>sitemapUrl</name>
	<value>https://mobidb.org/sitemap2.xml.gz</value>
</property>
<property>
	<name>sitemapURLKey</name>
	<value>loc</value>
</property>
<property>
	<name>dynamic</name>
	<value>true</value>
	<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
</property>
<property>
@@ -1,113 +0,0 @@

package eu.dnetlib.dhp.bmuse.bioschema;

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.bmuse.utils.ArgumentApplicationParser;
import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper;
import eu.dnetlib.dhp.bmuse.utils.UrlParser;

public class ScrapingJob {

	static Logger logger = LoggerFactory.getLogger(ScrapingJob.class);

	public static void main(String[] args) throws Exception {

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					ScrapingJob.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json")));
		parser.parseArgument(args);

		final String nameNode = parser.get("nameNode");
		final String workingPath = parser.get("workingPath");
		final String rdfOutput = parser.get("rdfOutput");
		final String sitemapUrl = parser.get("sitemapUrl");
		final String sitemapURLKey = parser.get("sitemapURLKey");
		final String dynamic = parser.get("dynamic");
		final String maxScrapedPages = parser.get("maxScrapedPages");
		Boolean dynamicValue = true;
		if (Objects.nonNull(dynamic)) {
			dynamicValue = Boolean.parseBoolean(dynamic);
		}
		final boolean scrapingType = dynamicValue.booleanValue();

		logger
			.info(
				"*************************** STARTING_SCRAPE");

		BMUSEScraper scraper = new BMUSEScraper();
		String url = sitemapUrl.toLowerCase();
		Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);

		Path output = new Path(
			nameNode
				.concat(workingPath)
				.concat(rdfOutput));
		Configuration conf = getHadoopConfiguration(nameNode);
		try (SequenceFile.Writer writer = SequenceFile
			.createWriter(
				conf,
				SequenceFile.Writer.file(output),
				SequenceFile.Writer.keyClass(Text.class),
				SequenceFile.Writer.valueClass(Text.class),
				SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
			Stream<Element> urlStream = null;
			if (Objects.nonNull(maxScrapedPages)) {
				urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
			} else {
				urlStream = urls.stream();
			}
			List<Element> sites = urlStream.collect(Collectors.toList());
			logger.info("Pages available for scraping: " + sites.size());
			sites.forEach(u -> {
				final Text key = new Text(u.text());
				String nquads;
				try {
					String site = u.text();
					logger.debug(site + " > parsing");
					nquads = scraper.scrapeUrl(site, scrapingType);
					final Text value = new Text(nquads);
					writer.append(key, value);
				} catch (Throwable t) {
					logger.error(u.text() + " -> ", t);
				}
			});
		}

		logger
			.info(
				"*************************** ENDING_SCRAPE: ");
	}

	public static Configuration getHadoopConfiguration(String nameNode) {
		// ====== Init HDFS File System Object
		Configuration conf = new Configuration();
		// Set FileSystem URI
		conf.set("fs.defaultFS", nameNode);
		// Because of Maven
		conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
		conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

		System.setProperty("hadoop.home.dir", "/");
		return conf;
	}
}
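Editor's note: the job above appends one (page url, nquads) pair per scraped page to a block-gzip SequenceFile. As an illustrative sketch only (not part of this changeset), the output could be read back with the standard Hadoop reader; the nameNode and path values are the defaults taken from config-default.xml and the workflow parameters further down.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class NquadsSeqReader {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// illustrative values from config-default.xml and the workflow defaults
		conf.set("fs.defaultFS", "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020");
		Path input = new Path("/data/bioschema/mobidb/nquads.seq"); // workingPath + rdfOutput
		try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(input))) {
			Text url = new Text();    // key: the scraped page url
			Text nquads = new Text(); // value: the nquads produced by BMUSEScraper
			while (reader.next(url, nquads)) {
				System.out.println(url + " -> " + nquads.getLength() + " bytes of nquads");
			}
		}
	}
}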
@@ -1,94 +0,0 @@

package eu.dnetlib.dhp.bmuse.utils;

import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.*;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

public class ArgumentApplicationParser implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);

	private final Options options = new Options();
	private final Map<String, String> objectMap = new HashMap<>();

	private final List<String> compressedValues = new ArrayList<>();

	public ArgumentApplicationParser(final String json_configuration) throws IOException {
		final ObjectMapper mapper = new ObjectMapper();
		final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
		createOptionMap(configuration);
	}

	public ArgumentApplicationParser(final OptionsParameter[] configuration) {
		createOptionMap(configuration);
	}

	private void createOptionMap(final OptionsParameter[] configuration) {
		Arrays
			.stream(configuration)
			.map(
				conf -> {
					final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
					o.setLongOpt(conf.getParamLongName());
					o.setRequired(conf.isParamRequired());
					if (conf.isCompressed()) {
						compressedValues.add(conf.getParamLongName());
					}
					return o;
				})
			.forEach(options::addOption);
	}

	public static String decompressValue(final String abstractCompressed) {
		try {
			byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
			GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
			final StringWriter stringWriter = new StringWriter();
			IOUtils.copy(gis, stringWriter);
			return stringWriter.toString();
		} catch (IOException e) {
			log.error("Wrong value to decompress: {}", abstractCompressed);
			throw new IllegalArgumentException(e);
		}
	}

	public static String compressArgument(final String value) throws IOException {
		ByteArrayOutputStream out = new ByteArrayOutputStream();
		GZIPOutputStream gzip = new GZIPOutputStream(out);
		gzip.write(value.getBytes());
		gzip.close();
		return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
	}

	public void parseArgument(final String[] args) throws ParseException {
		CommandLineParser parser = new BasicParser();
		CommandLine cmd = parser.parse(options, args);
		Arrays
			.stream(cmd.getOptions())
			.forEach(
				it -> objectMap
					.put(
						it.getLongOpt(),
						compressedValues.contains(it.getLongOpt())
							? decompressValue(it.getValue())
							: it.getValue()));
	}

	public String get(final String key) {
		return objectMap.get(key);
	}

	public Map<String, String> getObjectMap() {
		return objectMap;
	}
}
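Editor's note: parameters declared with "compressed": true are passed on the command line as gzip+Base64 strings and are decompressed transparently by parseArgument. A minimal round-trip sketch (illustrative, not part of this changeset):

// hypothetical value, only to show the compressArgument/decompressValue round trip
String original = "<some very long xml or json payload>";
String packed = ArgumentApplicationParser.compressArgument(original); // gzip + Base64
String unpacked = ArgumentApplicationParser.decompressValue(packed);  // Base64 + gunzip
assert original.equals(unpacked);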
@@ -1,91 +0,0 @@

package eu.dnetlib.dhp.bmuse.utils;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;

import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;

public class BMUSEScraper extends ScraperFilteredCore {

	private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName());

	public String scrapeUrl(String url, Boolean dynamic) throws Exception {
		logger.debug(url + " > scraping");
		url = fixURL(url);

		String html = "";
		// The dynamic boolean determines if the scraper should use selenium or JSOUP to scrape the information
		// (dynamic and static respectively)

		if (dynamic) {
			html = wrapHTMLExtraction(url);
		} else {
			html = wrapHTMLExtractionStatic(url);
		}

		if (html == null || html.contentEquals(""))
			throw new Exception("empty html");

		html = injectId(html, url);

		logger.debug(url + " > html scraped from " + url);
		DocumentSource source = new StringDocumentSource(html, url);
		String n3 = html2Triples(source, url);
		if (n3 == null) {
			throw new MissingMarkupException(url);
		}

		logger.debug(url + " > processing triples");
		IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
		Model updatedModel = processTriples(n3, sourceIRI, 0L);
		if (updatedModel == null) {
			throw new Exception("rdf model null");
		}

		logger.debug(url + " > generating nquads");
		try (StringWriter nquadsWriter = new StringWriter()) {
			Rio.write(updatedModel, nquadsWriter, RDFFormat.NQUADS);
			logger.debug(url + " > nquads generated");
			return nquadsWriter.toString();
		}
	}

	private String html2Triples(DocumentSource source, String url) throws Exception {
		Any23 runner = new Any23();
		try (ByteArrayOutputStream out = new ByteArrayOutputStream();
			TripleHandler handler = new NTriplesWriter(out)) {
			runner.extract(source, handler);
			return out.toString("UTF-8");
		} catch (ExtractionException e) {
			logger.error("Cannot extract triples", e);
		} catch (IOException e1) {
			logger.error("IO error whilst extracting triples", e1);
		} catch (TripleHandlerException e2) {
			logger.error("TripleHandlerException", e2);
		}
		return null;
	}
}
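Editor's note: a hypothetical single-page call of the scraper above (not part of this changeset), using the PED record URL already referenced by Html2TriplesTest further down. The dynamic=true path drives a browser via Selenium and needs the chromedriver configured by the chromiumDriverLocation property below; the static path uses JSoup only.

// illustrative usage of BMUSEScraper.scrapeUrl
BMUSEScraper scraper = new BMUSEScraper();
String pageUrl = "https://proteinensemble.org/PED00001";
String nquadsStatic = scraper.scrapeUrl(pageUrl, false); // static: JSoup extraction
String nquadsDynamic = scraper.scrapeUrl(pageUrl, true); // dynamic: Selenium, needs the configured chromedriver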
@@ -1,35 +0,0 @@

package eu.dnetlib.dhp.bmuse.utils;

public class OptionsParameter {

	private String paramName;
	private String paramLongName;
	private String paramDescription;
	private boolean paramRequired;
	private boolean compressed;

	public String getParamName() {
		return paramName;
	}

	public String getParamLongName() {
		return paramLongName;
	}

	public String getParamDescription() {
		return paramDescription;
	}

	public boolean isParamRequired() {
		return paramRequired;
	}

	public boolean isCompressed() {
		return compressed;
	}

	public void setCompressed(boolean compressed) {
		this.compressed = compressed;
	}
}
@@ -1,65 +0,0 @@

package eu.dnetlib.dhp.bmuse.utils;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import hwu.elixir.utils.Helpers;

public class UrlParser {

	private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());

	public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {

		Document doc = new Document(url);
		Document urlSitemapListsNested;
		Elements elements = new Elements();
		Elements sitemaps = new Elements();
		boolean sitemapindex = false;
		boolean urlset = false;

		try {
			int urlLength = url.length();
			logger.info("parse sitemap list");
			String sitemapExt = url.substring(urlLength - 3, urlLength);
			if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
				logger.info("compressed sitemap");
				byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
				doc = Helpers.gzipFileDecompression(bytes);
			} else {
				doc = Jsoup.connect(url).maxBodySize(0).get();
			}

		} catch (IOException e) {
			logger.error("Jsoup parsing exception: " + e.getMessage());
		}

		try {

			elements = doc.select(sitemapURLKey);

			// check the html if it is a sitemapindex or a urlset
			sitemapindex = doc.outerHtml().contains("sitemapindex");
			urlset = doc.outerHtml().contains("urlset");
		} catch (NullPointerException e) {
			logger.error(e.getMessage());
		}

		if (sitemapindex) {
			// if sitemapindex get the loc of all the sitemaps
			// added warning for sitemap index files
			logger
				.warn(
					"please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
			sitemaps = doc.select(sitemapURLKey);
		}

		return elements;
	}
}
@@ -1,44 +0,0 @@
[
	{
		"paramName": "n",
		"paramLongName": "nameNode",
		"paramDescription": "the Name Node URI",
		"paramRequired": true
	},
	{
		"paramName": "w",
		"paramLongName": "workingPath",
		"paramDescription": "the working path",
		"paramRequired": true
	},
	{
		"paramName": "r",
		"paramLongName": "rdfOutput",
		"paramDescription": "the rdf output of the scraping step",
		"paramRequired": true
	},
	{
		"paramName": "u",
		"paramLongName": "sitemapUrl",
		"paramDescription": "the sitemap url",
		"paramRequired": true
	},
	{
		"paramName": "k",
		"paramLongName": "sitemapURLKey",
		"paramDescription": "the sitemap file contains a list of xml entries, each one has a tag identified with sitemapURLKey with the url as value",
		"paramRequired": true
	},
	{
		"paramName": "d",
		"paramLongName": "dynamic",
		"paramDescription": "the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)",
		"paramRequired": false
	},
	{
		"paramName": "m",
		"paramLongName": "maxScrapedPages",
		"paramDescription": "max number of pages that will be scraped, default: no limit",
		"paramRequired": false
	}
]
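Editor's note: a sketch of how these definitions are consumed (illustrative only; argument values are the defaults appearing elsewhere in this changeset, and jsonConfiguration is a hypothetical variable holding the JSON above). ScrapingJob loads the JSON into ArgumentApplicationParser and reads each value back by its long name:

// hypothetical invocation mirroring the workflow's <arg> list
ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(new String[] {
	"--nameNode", "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020",
	"--workingPath", "/data/bioschema/mobidb/",
	"--rdfOutput", "nquads.seq",
	"--sitemapUrl", "https://mobidb.org/sitemap2.xml.gz",
	"--sitemapURLKey", "loc",
	"--dynamic", "true",
	"--maxScrapedPages", "5"
});
String workingPath = parser.get("workingPath"); // "/data/bioschema/mobidb/"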
@@ -1,22 +0,0 @@
<configuration>
	<property>
		<name>jobTracker</name>
		<value>yarn</value>
	</property>
	<property>
		<name>nameNode</name>
		<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
	</property>
	<property>
		<name>oozie.launcher.mapreduce.user.classpath.first</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.use.system.libpath</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.action.sharelib.for.spark</name>
		<value>spark2</value>
	</property>
</configuration>
@@ -1,81 +0,0 @@
<workflow-app name="BioSchemaHarvester" xmlns="uri:oozie:workflow:0.5">
	<parameters>
		<property>
			<name>workingPath</name>
			<value>/data/bioschema/mobidb/</value>
			<description>the working path</description>
		</property>
		<property>
			<name>sitemapUrl</name>
			<value>https://mobidb.org/sitemap2.xml.gz</value>
		</property>
		<property>
			<name>sitemapURLKey</name>
			<value>loc</value>
		</property>
		<property>
			<name>dynamic</name>
			<value>true</value>
			<description>the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)</description>
		</property>
		<property>
			<name>maxScrapedPages</name>
			<value>5</value>
			<description>max number of pages that will be scraped, default: no limit</description>
		</property>
		<property>
			<name>rdfOutput</name>
			<value>nquads.seq</value>
			<description>rdf output of scraping step</description>
		</property>
		<property>
			<name>scraping_java_opts</name>
			<value>-Xmx4g -Dwebdriver.chrome.whitelistedIps=</value>
			<description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
		</property>
	</parameters>

	<global>
		<job-tracker>${jobTracker}</job-tracker>
		<name-node>${nameNode}</name-node>
	</global>

	<start to="ResetWorkingPath"/>
	<kill name="Kill">
		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>

	<action name="ResetWorkingPath">
		<fs>
			<delete path='${workingPath}${rdfOutput}'/>
		</fs>
		<ok to="bmuseScraping"/>
		<error to="Kill"/>
	</action>

	<action name="bmuseScraping">
		<java>
			<job-tracker>${jobTracker}</job-tracker>
			<name-node>${nameNode}</name-node>
			<configuration>
				<property>
					<name>oozie.launcher.mapreduce.user.classpath.first</name>
					<value>true</value>
				</property>
			</configuration>
			<main-class>eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob</main-class>
			<java-opts>${scraping_java_opts}</java-opts>
			<arg>--nameNode</arg><arg>${nameNode}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--rdfOutput</arg><arg>${rdfOutput}</arg>
			<arg>--sitemapUrl</arg><arg>${sitemapUrl}</arg>
			<arg>--sitemapURLKey</arg><arg>${sitemapURLKey}</arg>
			<arg>--dynamic</arg><arg>${dynamic}</arg>
			<arg>--maxScrapedPages</arg><arg>${maxScrapedPages}</arg>
		</java>
		<ok to="End"/>
		<error to="Kill"/>
	</action>

	<end name="End"/>
</workflow-app>
@@ -1,4 +0,0 @@
maxLimitScrape=200000
schemaContext=https\://schema.org/docs/jsonldcontext.jsonld
dynamic=true
chromiumDriverLocation=/bin/chromedriver
@@ -1,9 +0,0 @@
# Set root logger level to INFO and its only appender to A1.
log4j.rootLogger=INFO, A1

# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender

# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
@@ -1,45 +0,0 @@

package eu.dnetlib.dhp.bmuse.bioschema;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Html2TriplesTest {

	static Logger logger = LoggerFactory.getLogger(Html2TriplesTest.class);

	@Test
	// @Disabled
	void conversionTest() throws Exception {
		InputStream is = Html2TriplesTest.class.getResourceAsStream("/eu/dnetlib/dhp/bmuse/bioschema/ped.html");
		String page = IOUtils.toString(is, StandardCharsets.UTF_8.name());
		DocumentSource source = new StringDocumentSource(page, "https://proteinensemble.org/PED00001");
		Any23 runner = new Any23();
		try (ByteArrayOutputStream out = new ByteArrayOutputStream();
			TripleHandler handler = new NTriplesWriter(out)) {
			runner.extract(source, handler);
			logger.info(out.toString("UTF-8"));
		} catch (ExtractionException e) {
			logger.error("Cannot extract triples", e);
		} catch (IOException e1) {
			logger.error("IO error whilst extracting triples", e1);
		} catch (TripleHandlerException e2) {
			logger.error("TripleHandlerException", e2);
		}

	}
}
@@ -1,24 +0,0 @@

package eu.dnetlib.dhp.bmuse.bioschema;

import org.jsoup.select.Elements;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.bmuse.utils.UrlParser;

public class SitemapTest {

	static Logger logger = LoggerFactory.getLogger(SitemapTest.class);

	@Test
	@Disabled
	void sitemapGzTest() throws Exception {
		Elements urls = UrlParser.getSitemapList("https://disprot.org/sitemap2.xml.gz", "loc");
		urls.forEach(url -> {
			logger.info(url.text());
		});
	}
}
File diff suppressed because one or more lines are too long
@@ -4,7 +4,7 @@
 	<parent>
 		<groupId>eu.dnetlib.dhp</groupId>
 		<artifactId>dhp-workflows</artifactId>
-		<version>1.2.4-SNAPSHOT</version>
+		<version>1.2.5-SNAPSHOT</version>
 	</parent>
 	<artifactId>dhp-rdfconverter</artifactId>
