updated dhp-rdfconverter version to 1.2.5-SNAPSHOT

This commit is contained in:
Enrico Ottonello 2022-05-11 11:20:16 +02:00
parent 6fa9624c29
commit baa312f256
16 changed files with 1 addition and 823 deletions

View File

@@ -1,96 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-bmuse</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>hwu.elixir</groupId>
<artifactId>bmuse-core</artifactId>
<version>0.5.4</version>
</dependency>
<dependency>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId>
<version>2.3</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId>
<version>3.7.1</version>
</dependency>
<dependency>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-model</artifactId>
<version>3.7.1</version>
</dependency>
<!-- rdf4j upgraded from 2.5.4 to 3.7.1 -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>22.0</version>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.18</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.9.6</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.9.6</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.9.6</version>
</dependency>
</dependencies>
</project>

View File

@@ -1,62 +0,0 @@
https://grafana.d4science.org/d/xfpJB9FGz-pa1/1-node-exporter-garr-pa1?orgId=1&var-origin_prometheus=&var-job=node&var-hostname=hadoop-worker8.garr-pa1.d4science.org&var-node=hadoop-worker-8&var-device=All&var-interval=2m&var-maxmount=%2Fhadoop&var-show_hostname=hadoop-worker8.garr-pa1.d4science.org&var-total=49&from=1638522510612&to=1638526110612
PED
<property>
<name>workingPath</name>
<value>/data/bioschema/ped/</value>
<description>the working path</description>
</property>
<property>
<name>sitemapUrl</name>
<value>https://proteinensemble.org/sitemap2.xml.gz</value>
</property>
<property>
<name>sitemapURLKey</name>
<value>loc</value>
</property>
<property>
<name>dynamic</name>
<value>true</value>
<description>if true the scraper uses Selenium (dynamic scraping), if false it uses JSOUP (static scraping)</description>
</property>
DISPROT
<property>
<name>workingPath</name>
<value>/data/bioschema/disprot/</value>
<description>the working path</description>
</property>
<property>
<name>sitemapUrl</name>
<value>https://disprot.org/sitemap2.xml.gz</value>
</property>
<property>
<name>sitemapURLKey</name>
<value>loc</value>
</property>
<property>
<name>dynamic</name>
<value>true</value>
<description>if true the scraper uses Selenium (dynamic scraping), if false it uses JSOUP (static scraping)</description>
</property>
MOBIDB
<property>
<name>workingPath</name>
<value>/data/bioschema/mobidb/</value>
<description>the working path</description>
</property>
<property>
<name>sitemapUrl</name>
<value>https://mobidb.org/sitemap2.xml.gz</value>
</property>
<property>
<name>sitemapURLKey</name>
<value>loc</value>
</property>
<property>
<name>dynamic</name>
<value>true</value>
<description>if true the scraper uses Selenium (dynamic scraping), if false it uses JSOUP (static scraping)</description>
</property>
<property>

View File

@@ -1,113 +0,0 @@
package eu.dnetlib.dhp.bmuse.bioschema;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.bmuse.utils.ArgumentApplicationParser;
import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper;
import eu.dnetlib.dhp.bmuse.utils.UrlParser;
public class ScrapingJob {
static Logger logger = LoggerFactory.getLogger(ScrapingJob.class);
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
ScrapingJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json")));
parser.parseArgument(args);
final String nameNode = parser.get("nameNode");
final String workingPath = parser.get("workingPath");
final String rdfOutput = parser.get("rdfOutput");
final String sitemapUrl = parser.get("sitemapUrl");
final String sitemapURLKey = parser.get("sitemapURLKey");
final String dynamic = parser.get("dynamic");
final String maxScrapedPages = parser.get("maxScrapedPages");
Boolean dynamicValue = true;
if (Objects.nonNull(dynamic)) {
dynamicValue = Boolean.parseBoolean(dynamic);
}
final boolean scrapingType = dynamicValue.booleanValue();
logger
.info(
"*************************** STARTING_SCRAPE");
BMUSEScraper scraper = new BMUSEScraper();
String url = sitemapUrl.toLowerCase();
Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);
Path output = new Path(
nameNode
.concat(workingPath)
.concat(rdfOutput));
Configuration conf = getHadoopConfiguration(nameNode);
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(output),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
Stream<Element> urlStream = null;
if (Objects.nonNull(maxScrapedPages)) {
urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
} else {
urlStream = urls.stream();
}
List<Element> sites = urlStream.collect(Collectors.toList());
logger.info("Pages available for scraping: " + sites.size());
sites.forEach(u -> {
final Text key = new Text(u.text());
String nquads;
try {
String site = u.text();
logger.debug(site + " > parsing");
nquads = scraper.scrapeUrl(site, scrapingType);
final Text value = new Text(nquads);
writer.append(key, value);
} catch (Throwable t) {
logger.error(u.text() + " -> ", t);
}
});
}
logger
.info(
"*************************** ENDING_SCRAPE");
}
public static Configuration getHadoopConfiguration(String nameNode) {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", nameNode);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("hadoop.home.dir", "/");
return conf;
}
}
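For context, a minimal sketch (not part of this commit) of how the Text/Text pairs written by the job above could be read back from the sequence file; the class name and the HDFS path argument are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class NquadsSeqReader {
	public static void main(String[] args) throws Exception {
		// args[0]: fully qualified HDFS URI of the output, e.g. hdfs://.../nquads.seq (placeholder)
		Configuration conf = new Configuration();
		try (SequenceFile.Reader reader = new SequenceFile.Reader(
			conf, SequenceFile.Reader.file(new Path(args[0])))) {
			Text key = new Text(); // the scraped page URL
			Text value = new Text(); // the nquads produced for that page
			while (reader.next(key, value)) {
				System.out.println(key + " -> " + value.getLength() + " bytes of nquads");
			}
		}
	}
}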

View File

@@ -1,94 +0,0 @@
package eu.dnetlib.dhp.bmuse.utils;
import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.*;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class ArgumentApplicationParser implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);
private final Options options = new Options();
private final Map<String, String> objectMap = new HashMap<>();
private final List<String> compressedValues = new ArrayList<>();
public ArgumentApplicationParser(final String json_configuration) throws IOException {
final ObjectMapper mapper = new ObjectMapper();
final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
createOptionMap(configuration);
}
public ArgumentApplicationParser(final OptionsParameter[] configuration) {
createOptionMap(configuration);
}
private void createOptionMap(final OptionsParameter[] configuration) {
Arrays
.stream(configuration)
.map(
conf -> {
final Option o = new Option(conf.getParamName(), true, conf.getParamDescription());
o.setLongOpt(conf.getParamLongName());
o.setRequired(conf.isParamRequired());
if (conf.isCompressed()) {
compressedValues.add(conf.getParamLongName());
}
return o;
})
.forEach(options::addOption);
}
public static String decompressValue(final String abstractCompressed) {
try {
byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes());
GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
final StringWriter stringWriter = new StringWriter();
IOUtils.copy(gis, stringWriter);
return stringWriter.toString();
} catch (IOException e) {
log.error("Wrong value to decompress: {}", abstractCompressed);
throw new IllegalArgumentException(e);
}
}
public static String compressArgument(final String value) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(value.getBytes());
gzip.close();
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
}
public void parseArgument(final String[] args) throws ParseException {
CommandLineParser parser = new BasicParser();
CommandLine cmd = parser.parse(options, args);
Arrays
.stream(cmd.getOptions())
.forEach(
it -> objectMap
.put(
it.getLongOpt(),
compressedValues.contains(it.getLongOpt())
? decompressValue(it.getValue())
: it.getValue()));
}
public String get(final String key) {
return objectMap.get(key);
}
public Map<String, String> getObjectMap() {
return objectMap;
}
}
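A minimal usage sketch of the parser above; the parameter definition is illustrative and not taken from this commit:

import eu.dnetlib.dhp.bmuse.utils.ArgumentApplicationParser;

public class ArgumentParserExample {
	public static void main(String[] args) throws Exception {
		// Hypothetical definition of a single required, compressed option -i/--input.
		String json = "[{\"paramName\":\"i\",\"paramLongName\":\"input\","
			+ "\"paramDescription\":\"input text\",\"paramRequired\":true,\"compressed\":true}]";
		ArgumentApplicationParser parser = new ArgumentApplicationParser(json);
		// A compressed value is gzip+Base64 encoded by the caller...
		String packed = ArgumentApplicationParser.compressArgument("some long payload");
		// ...and transparently decoded again by parseArgument.
		parser.parseArgument(new String[] { "-i", packed });
		System.out.println(parser.get("input")); // prints "some long payload"
	}
}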

View File

@@ -1,91 +0,0 @@
package eu.dnetlib.dhp.bmuse.utils;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import hwu.elixir.scrape.exceptions.*;
import hwu.elixir.scrape.scraper.ScraperFilteredCore;
public class BMUSEScraper extends ScraperFilteredCore {
private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName());
public String scrapeUrl(String url, Boolean dynamic) throws Exception {
logger.debug(url + " > scraping");
url = fixURL(url);
String html = "";
// The dynamic flag selects the scraping strategy: Selenium when true (dynamic), JSOUP when false (static).
if (dynamic) {
html = wrapHTMLExtraction(url);
} else {
html = wrapHTMLExtractionStatic(url);
}
if (html == null || html.contentEquals(""))
throw new Exception("empty html");
html = injectId(html, url);
logger.debug(url + " > html scraped from " + url);
DocumentSource source = new StringDocumentSource(html, url);
String n3 = html2Triples(source, url);
if (n3 == null) {
throw new MissingMarkupException(url);
}
logger.debug(url + " > processing triples");
IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
Model updatedModel = processTriples(n3, sourceIRI, 0L);
if (updatedModel == null) {
throw new Exception("rdf model null");
}
logger.debug(url + " > generating nquads");
try (StringWriter nquadsWriter = new StringWriter()) {
Rio.write(updatedModel, nquadsWriter, RDFFormat.NQUADS);
logger.debug(url + " > nquads generated");
return nquadsWriter.toString();
}
}
private String html2Triples(DocumentSource source, String url) throws Exception {
Any23 runner = new Any23();
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
TripleHandler handler = new NTriplesWriter(out);) {
runner.extract(source, handler);
return out.toString("UTF-8");
} catch (ExtractionException e) {
logger.error("Cannot extract triples", e);
} catch (IOException e1) {
logger.error(" IO error whilst extracting triples", e1);
} catch (TripleHandlerException e2) {
logger.error("TripleHanderException", e2);
}
return null;
}
}
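A hedged usage sketch of the scraper above; the URL is a placeholder, and a dynamic (Selenium) run would additionally need the chromedriver configured in the properties file shown later in this diff:

import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper;

public class BMUSEScraperExample {
	public static void main(String[] args) throws Exception {
		BMUSEScraper scraper = new BMUSEScraper();
		// false selects the static (JSOUP) strategy, so no Selenium setup is needed here.
		String nquads = scraper.scrapeUrl("https://example.org/some-bioschemas-page", false);
		System.out.println(nquads);
	}
}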

View File

@@ -1,35 +0,0 @@
package eu.dnetlib.dhp.bmuse.utils;
public class OptionsParameter {
private String paramName;
private String paramLongName;
private String paramDescription;
private boolean paramRequired;
private boolean compressed;
public String getParamName() {
return paramName;
}
public String getParamLongName() {
return paramLongName;
}
public String getParamDescription() {
return paramDescription;
}
public boolean isParamRequired() {
return paramRequired;
}
public boolean isCompressed() {
return compressed;
}
public void setCompressed(boolean compressed) {
this.compressed = compressed;
}
}

View File

@@ -1,65 +0,0 @@
package eu.dnetlib.dhp.bmuse.utils;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import hwu.elixir.utils.Helpers;
public class UrlParser {
private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());
public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
Document doc = new Document(url);
Document urlSitemapListsNested;
Elements elements = new Elements();
Elements sitemaps = new Elements();
boolean sitemapindex = false;
boolean urlset = false;
try {
int urlLength = url.length();
logger.info("parse sitemap list");
String sitemapExt = url.substring(urlLength - 3, urlLength);
if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
logger.info("compressed sitemap");
byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
doc = Helpers.gzipFileDecompression(bytes);
} else {
doc = Jsoup.connect(url).maxBodySize(0).get();
}
} catch (IOException e) {
logger.error("Jsoup parsing exception: " + e.getMessage());
}
try {
elements = doc.select(sitemapURLKey);
// check the html if it is a sitemapindex or a urlset
sitemapindex = doc.outerHtml().contains("sitemapindex");
urlset = doc.outerHtml().contains("urlset");
} catch (NullPointerException e) {
logger.error(e.getMessage());
}
if (sitemapindex) {
// if sitemapindex get the loc of all the sitemaps
// added warning for sitemap index files
logger
.warn(
"please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
sitemaps = doc.select(sitemapURLKey);
}
return elements;
}
}

View File

@@ -1,44 +0,0 @@
[
{
"paramName": "n",
"paramLongName": "nameNode",
"paramDescription": "the Name Node URI",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "the working path",
"paramRequired": true
},
{
"paramName": "r",
"paramLongName": "rdfOutput",
"paramDescription": "the working path",
"paramRequired": true
},
{
"paramName": "u",
"paramLongName": "sitemapUrl",
"paramDescription": "the sitemap url",
"paramRequired": true
},
{
"paramName": "k",
"paramLongName": "sitemapURLKey",
"paramDescription": "the sitemap file contains a list of xml entries, each one has a tag identified with sitemapURLKey with the url as value",
"paramRequired": true
},
{
"paramName": "d",
"paramLongName": "dynamic",
"paramDescription": "the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)",
"paramRequired": false
},
{
"paramName": "m",
"paramLongName": "maxScrapedPages",
"paramDescription": "max number of pages that will be scraped, default: no limit",
"paramRequired": false
}
]
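For illustration, a hedged sketch of how these parameters map to command-line flags when ScrapingJob is invoked directly; every value below is a placeholder, and the oozie workflow later in this diff passes the same flags through its java action:

import eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob;

public class ScrapingJobExample {
	public static void main(String[] args) throws Exception {
		ScrapingJob.main(new String[] {
			"--nameNode", "hdfs://namenode.example.org:8020", // placeholder cluster
			"--workingPath", "/data/bioschema/example/",
			"--rdfOutput", "nquads.seq",
			"--sitemapUrl", "https://example.org/sitemap.xml.gz",
			"--sitemapURLKey", "loc",
			"--dynamic", "false",
			"--maxScrapedPages", "10"
		});
	}
}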

View File

@@ -1,22 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarn</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@@ -1,81 +0,0 @@
<workflow-app name="BioSchemaHarvester" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<value>/data/bioschema/mobidb/</value>
<description>the working path</description>
</property>
<property>
<name>sitemapUrl</name>
<value>https://mobidb.org/sitemap2.xml.gz</value>
</property>
<property>
<name>sitemapURLKey</name>
<value>loc</value>
</property>
<property>
<name>dynamic</name>
<value>true</value>
<description>if true the scraper uses Selenium (dynamic scraping), if false it uses JSOUP (static scraping)</description>
</property>
<property>
<name>maxScrapedPages</name>
<value>5</value>
<description>max number of pages that will be scraped, default: no limit</description>
</property>
<property>
<name>rdfOutput</name>
<value>nquads.seq</value>
<description>rdf output of scraping step</description>
</property>
<property>
<name>scraping_java_opts</name>
<value>-Xmx4g -Dwebdriver.chrome.whitelistedIps=</value>
<description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
</global>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}${rdfOutput}'/>
</fs>
<ok to="bmuseScraping"/>
<error to="Kill"/>
</action>
<action name="bmuseScraping">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob</main-class>
<java-opts>${scraping_java_opts}</java-opts>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--rdfOutput</arg><arg>${rdfOutput}</arg>
<arg>--sitemapUrl</arg><arg>${sitemapUrl}</arg>
<arg>--sitemapURLKey</arg><arg>${sitemapURLKey}</arg>
<arg>--dynamic</arg><arg>${dynamic}</arg>
<arg>--maxScrapedPages</arg><arg>${maxScrapedPages}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@@ -1,4 +0,0 @@
maxLimitScrape=200000
schemaContext=https\://schema.org/docs/jsonldcontext.jsonld
dynamic=true
chromiumDriverLocation=/bin/chromedriver

View File

@@ -1,9 +0,0 @@
# Set root logger level to INFO and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

View File

@@ -1,45 +0,0 @@
package eu.dnetlib.dhp.bmuse.bioschema;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class Html2TriplesTest {
static Logger logger = LoggerFactory.getLogger(Html2TriplesTest.class);
@Test
// @Disabled
void conversionTest() throws Exception {
InputStream is = Html2TriplesTest.class.getResourceAsStream("/eu/dnetlib/dhp/bmuse/bioschema/ped.html");
String page = IOUtils.toString(is, StandardCharsets.UTF_8.name());
DocumentSource source = new StringDocumentSource(page, "https://proteinensemble.org/PED00001");
Any23 runner = new Any23();
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
TripleHandler handler = new NTriplesWriter(out);) {
runner.extract(source, handler);
logger.info(out.toString("UTF-8"));
} catch (ExtractionException e) {
logger.error("Cannot extract triples", e);
} catch (IOException e1) {
logger.error(" IO error whilst extracting triples", e1);
} catch (TripleHandlerException e2) {
logger.error("TripleHanderException", e2);
}
}
}

View File

@@ -1,24 +0,0 @@
package eu.dnetlib.dhp.bmuse.bioschema;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.bmuse.utils.UrlParser;
public class SitemapTest {
static Logger logger = LoggerFactory.getLogger(SitemapTest.class);
@Test
@Disabled
void sitemapGzTest() throws Exception {
Elements urls = UrlParser.getSitemapList("https://disprot.org/sitemap2.xml.gz", "loc");
urls.forEach(url -> {
logger.info(url.text());
});
}
}

File diff suppressed because one or more lines are too long

View File

@@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-rdfconverter</artifactId>