From baa312f2561cd6e05ccccb797a2f7c334e77d3e1 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Wed, 11 May 2022 11:20:16 +0200 Subject: [PATCH] updated dhp-rdfconverter version to 1.2.5-SNAPSHOT --- dhp-workflows/dhp-bmuse/pom.xml | 96 --------------- dhp-workflows/dhp-bmuse/sitemap.txt | 62 ---------- .../dhp/bmuse/bioschema/ScrapingJob.java | 113 ------------------ .../utils/ArgumentApplicationParser.java | 94 --------------- .../dnetlib/dhp/bmuse/utils/BMUSEScraper.java | 91 -------------- .../dhp/bmuse/utils/OptionsParameter.java | 35 ------ .../eu/dnetlib/dhp/bmuse/utils/UrlParser.java | 65 ---------- .../dhp/bmuse/bioschema/generate_dataset.json | 44 ------- .../bioschema/oozie_app/config-default.xml | 22 ---- .../bmuse/bioschema/oozie_app/workflow.xml | 81 ------------- .../src/main/resources/localconfig.properties | 4 - .../src/main/resources/log4j.properties | 9 -- .../dhp/bmuse/bioschema/Html2TriplesTest.java | 45 ------- .../dhp/bmuse/bioschema/SitemapTest.java | 24 ---- .../eu/dnetlib/dhp/bmuse/bioschema/ped.html | 37 ------ dhp-workflows/dhp-rdfconverter/pom.xml | 2 +- 16 files changed, 1 insertion(+), 823 deletions(-) delete mode 100644 dhp-workflows/dhp-bmuse/pom.xml delete mode 100644 dhp-workflows/dhp-bmuse/sitemap.txt delete mode 100644 dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java delete mode 100644 dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/ArgumentApplicationParser.java delete mode 100644 dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java delete mode 100644 dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/OptionsParameter.java delete mode 100644 dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java delete mode 100644 dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json delete mode 100644 dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml delete mode 100644 dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties delete mode 100644 dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties delete mode 100644 dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/Html2TriplesTest.java delete mode 100644 dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java delete mode 100644 dhp-workflows/dhp-bmuse/src/test/resources/eu/dnetlib/dhp/bmuse/bioschema/ped.html diff --git a/dhp-workflows/dhp-bmuse/pom.xml b/dhp-workflows/dhp-bmuse/pom.xml deleted file mode 100644 index 6b81f4cba..000000000 --- a/dhp-workflows/dhp-bmuse/pom.xml +++ /dev/null @@ -1,96 +0,0 @@ - - - 4.0.0 - - eu.dnetlib.dhp - dhp-workflows - 1.2.4-SNAPSHOT - - dhp-bmuse - - - - org.apache.spark - spark-core_2.11 - - - org.apache.spark - spark-sql_2.11 - - - hwu.elixir - bmuse-core - 0.5.4 - - - - org.apache.any23 - apache-any23-core - 2.3 - - - org.eclipse.rdf4j - rdf4j-rio-rdfxml - 3.7.1 - - - org.eclipse.rdf4j - rdf4j-model - 3.7.1 - - - - org.jsoup - jsoup - 1.13.1 - - - org.seleniumhq.selenium - selenium-java - 3.141.59 - - - commons-io - commons-io - 2.6 - - - commons-validator - commons-validator - 1.6 - - - - com.google.guava - guava - 22.0 - - - com.squareup.okhttp3 - okhttp - 3.11.0 - - - org.apache.commons - commons-compress - 1.18 - - - com.fasterxml.jackson.core - jackson-core - 2.9.6 - - - - 
com.fasterxml.jackson.core - jackson-annotations - 2.9.6 - - - com.fasterxml.jackson.core - jackson-databind - 2.9.6 - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/sitemap.txt b/dhp-workflows/dhp-bmuse/sitemap.txt deleted file mode 100644 index d8ed5ebe1..000000000 --- a/dhp-workflows/dhp-bmuse/sitemap.txt +++ /dev/null @@ -1,62 +0,0 @@ -https://grafana.d4science.org/d/xfpJB9FGz-pa1/1-node-exporter-garr-pa1?orgId=1&var-origin_prometheus=&var-job=node&var-hostname=hadoop-worker8.garr-pa1.d4science.org&var-node=hadoop-worker-8&var-device=All&var-interval=2m&var-maxmount=%2Fhadoop&var-show_hostname=hadoop-worker8.garr-pa1.d4science.org&var-total=49&from=1638522510612&to=1638526110612 - -PED - - workingPath - /data/bioschema/ped/ - the working path - - - sitemapUrl - https://proteinensemble.org/sitemap2.xml.gz - - - sitemapURLKey - loc - - - dynamic - true - the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) - - -DISPROT - - workingPath - /data/bioschema/disprot/ - the working path - - - sitemapUrl - https://disprot.org/sitemap2.xml.gz - - - sitemapURLKey - loc - - - dynamic - true - the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) - - -MOBIDB - - workingPath - /data/bioschema/mobidb/ - the working path - - - sitemapUrl - https://mobidb.org/sitemap2.xml.gz - - - sitemapURLKey - loc - - - dynamic - true - the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) - - diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java deleted file mode 100644 index 5a8c11a5b..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java +++ /dev/null @@ -1,113 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.bioschema; - -import java.text.SimpleDateFormat; -import java.util.Arrays; -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.compress.GzipCodec; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.bmuse.utils.ArgumentApplicationParser; -import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper; -import eu.dnetlib.dhp.bmuse.utils.UrlParser; - -public class ScrapingJob { - - static Logger logger = LoggerFactory.getLogger(ScrapingJob.class); - - public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - ScrapingJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json"))); - parser.parseArgument(args); - - final String nameNode = parser.get("nameNode"); - final String workingPath = parser.get("workingPath"); - final String rdfOutput = parser.get("rdfOutput"); - final String sitemapUrl = parser.get("sitemapUrl"); - final String sitemapURLKey = parser.get("sitemapURLKey"); - final String dynamic = parser.get("dynamic"); - final String 
maxScrapedPages = parser.get("maxScrapedPages"); - Boolean dynamicValue = true; - if (Objects.nonNull(dynamic)) { - dynamicValue = Boolean.parseBoolean(dynamic); - } - final boolean scrapingType = dynamicValue.booleanValue(); - - logger - .info( - "*************************** STARTING_SCRAPE"); - - BMUSEScraper scraper = new BMUSEScraper(); - String url = sitemapUrl.toLowerCase(); - Elements urls = UrlParser.getSitemapList(url, sitemapURLKey); - - Path output = new Path( - nameNode - .concat(workingPath) - .concat(rdfOutput)); - Configuration conf = getHadoopConfiguration(nameNode); - try (SequenceFile.Writer writer = SequenceFile - .createWriter( - conf, - SequenceFile.Writer.file(output), - SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class), - SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) { - Stream urlStream = null; - if (Objects.nonNull(maxScrapedPages)) { - urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages)); - } else { - urlStream = urls.stream(); - } - List sites = urlStream.collect(Collectors.toList()); - logger.info("Pages available for scraping: " + sites.size()); - sites.forEach(u -> { - final Text key = new Text(u.text()); - String nquads; - try { - String site = u.text(); - logger.debug(site + " > parsing"); - nquads = scraper.scrapeUrl(site, scrapingType); - final Text value = new Text(nquads); - writer.append(key, value); - } catch (Throwable t) { - logger.error(u.text() + " -> ", t); - } - }); - } - - logger - .info( - "*************************** ENDING_SCRAPE: "); - } - - public static Configuration getHadoopConfiguration(String nameNode) { - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", nameNode); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - - System.setProperty("hadoop.home.dir", "/"); - return conf; - } -} diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/ArgumentApplicationParser.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/ArgumentApplicationParser.java deleted file mode 100644 index 93efe93e9..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/ArgumentApplicationParser.java +++ /dev/null @@ -1,94 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.utils; - -import java.io.*; -import java.util.*; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -import org.apache.commons.cli.*; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -public class ArgumentApplicationParser implements Serializable { - - private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class); - - private final Options options = new Options(); - private final Map objectMap = new HashMap<>(); - - private final List compressedValues = new ArrayList<>(); - - public ArgumentApplicationParser(final String json_configuration) throws IOException { - final ObjectMapper mapper = new ObjectMapper(); - final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); - createOptionMap(configuration); - } - - public ArgumentApplicationParser(final OptionsParameter[] 
configuration) { - createOptionMap(configuration); - } - - private void createOptionMap(final OptionsParameter[] configuration) { - Arrays - .stream(configuration) - .map( - conf -> { - final Option o = new Option(conf.getParamName(), true, conf.getParamDescription()); - o.setLongOpt(conf.getParamLongName()); - o.setRequired(conf.isParamRequired()); - if (conf.isCompressed()) { - compressedValues.add(conf.getParamLongName()); - } - return o; - }) - .forEach(options::addOption); - } - - public static String decompressValue(final String abstractCompressed) { - try { - byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes()); - GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray)); - final StringWriter stringWriter = new StringWriter(); - IOUtils.copy(gis, stringWriter); - return stringWriter.toString(); - } catch (IOException e) { - log.error("Wrong value to decompress: {}", abstractCompressed); - throw new IllegalArgumentException(e); - } - } - - public static String compressArgument(final String value) throws IOException { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - GZIPOutputStream gzip = new GZIPOutputStream(out); - gzip.write(value.getBytes()); - gzip.close(); - return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); - } - - public void parseArgument(final String[] args) throws ParseException { - CommandLineParser parser = new BasicParser(); - CommandLine cmd = parser.parse(options, args); - Arrays - .stream(cmd.getOptions()) - .forEach( - it -> objectMap - .put( - it.getLongOpt(), - compressedValues.contains(it.getLongOpt()) - ? decompressValue(it.getValue()) - : it.getValue())); - } - - public String get(final String key) { - return objectMap.get(key); - } - - public Map getObjectMap() { - return objectMap; - } -} diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java deleted file mode 100644 index c4924f4d3..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java +++ /dev/null @@ -1,91 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.io.StringWriter; -import java.nio.charset.StandardCharsets; - -import org.apache.any23.Any23; -import org.apache.any23.extractor.ExtractionException; -import org.apache.any23.source.DocumentSource; -import org.apache.any23.source.StringDocumentSource; -import org.apache.any23.writer.NTriplesWriter; -import org.apache.any23.writer.TripleHandler; -import org.apache.any23.writer.TripleHandlerException; -import org.apache.commons.io.IOUtils; -import org.apache.commons.io.output.ByteArrayOutputStream; -import org.eclipse.rdf4j.model.IRI; -import org.eclipse.rdf4j.model.Model; -import org.eclipse.rdf4j.model.impl.SimpleValueFactory; -import org.eclipse.rdf4j.rio.RDFFormat; -import org.eclipse.rdf4j.rio.Rio; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import hwu.elixir.scrape.exceptions.*; -import hwu.elixir.scrape.scraper.ScraperFilteredCore; - -public class BMUSEScraper extends ScraperFilteredCore { - - private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName()); - - public String scrapeUrl(String url, Boolean dynamic) throws Exception { - logger.debug(url + " > scraping"); - url = fixURL(url); - - String html = ""; - // The dynamic boolean determines if the scraper should start using selenium or JSOUP to 
scrape the information - // (dynamic and static respectively) - - if (dynamic) { - html = wrapHTMLExtraction(url); - } else { - html = wrapHTMLExtractionStatic(url); - } - - if (html == null || html.contentEquals("")) - throw new Exception("empty html"); - - html = injectId(html, url); - - logger.debug(url + " > html scraped from " + url); - DocumentSource source = new StringDocumentSource(html, url); - String n3 = html2Triples(source, url); - if (n3 == null) { - throw new MissingMarkupException(url); - } - - logger.debug(url + " > processing triples"); - IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI()); - Model updatedModel = updatedModel = processTriples(n3, sourceIRI, 0l); - if (updatedModel == null) { - throw new Exception("rdf model null"); - } - - logger.debug(url + " > generating nquads"); - try (StringWriter jsonLDWriter = new StringWriter()) { - Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS); - logger.debug(url + " > nquads generated"); - return jsonLDWriter.toString(); - } catch (Exception e) { - throw e; - } - } - - private String html2Triples(DocumentSource source, String url) throws Exception { - Any23 runner = new Any23(); - try (ByteArrayOutputStream out = new ByteArrayOutputStream(); - TripleHandler handler = new NTriplesWriter(out);) { - runner.extract(source, handler); - return out.toString("UTF-8"); - } catch (ExtractionException e) { - logger.error("Cannot extract triples", e); - } catch (IOException e1) { - logger.error(" IO error whilst extracting triples", e1); - } catch (TripleHandlerException e2) { - logger.error("TripleHanderException", e2); - } - return null; - } -} diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/OptionsParameter.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/OptionsParameter.java deleted file mode 100644 index f9dc13558..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/OptionsParameter.java +++ /dev/null @@ -1,35 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.utils; - -public class OptionsParameter { - - private String paramName; - private String paramLongName; - private String paramDescription; - private boolean paramRequired; - private boolean compressed; - - public String getParamName() { - return paramName; - } - - public String getParamLongName() { - return paramLongName; - } - - public String getParamDescription() { - return paramDescription; - } - - public boolean isParamRequired() { - return paramRequired; - } - - public boolean isCompressed() { - return compressed; - } - - public void setCompressed(boolean compressed) { - this.compressed = compressed; - } -} diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java deleted file mode 100644 index c1c626b69..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java +++ /dev/null @@ -1,65 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.utils; - -import java.io.IOException; - -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.select.Elements; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import hwu.elixir.utils.Helpers; - -public class UrlParser { - - private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName()); - - public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException { - - Document doc = new Document(url); - Document 
urlSitemapListsNested; - Elements elements = new Elements(); - Elements sitemaps = new Elements(); - boolean sitemapindex = false; - boolean urlset = false; - - try { - int urlLength = url.length(); - logger.info("parse sitemap list"); - String sitemapExt = url.substring(urlLength - 3, urlLength); - if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending - logger.info("compressed sitemap"); - byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes(); - doc = Helpers.gzipFileDecompression(bytes); - } else { - doc = Jsoup.connect(url).maxBodySize(0).get(); - } - - } catch (IOException e) { - logger.error("Jsoup parsing exception: " + e.getMessage()); - } - - try { - - elements = doc.select(sitemapURLKey); - - // check the html if it is a sitemapindex or a urlset - sitemapindex = doc.outerHtml().contains("sitemapindex"); - urlset = doc.outerHtml().contains("urlset"); - } catch (NullPointerException e) { - logger.error(e.getMessage()); - } - - if (sitemapindex) { - // if sitemapindex get the loc of all the sitemaps - // added warning for sitemap index files - logger - .warn( - "please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead"); - sitemaps = doc.select(sitemapURLKey); - } - - return elements; - } -} diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json deleted file mode 100644 index 1ac0b50de..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json +++ /dev/null @@ -1,44 +0,0 @@ -[ - { - "paramName": "n", - "paramLongName": "nameNode", - "paramDescription": "the Name Node URI", - "paramRequired": true - }, - { - "paramName": "w", - "paramLongName": "workingPath", - "paramDescription": "the working path", - "paramRequired": true - }, - { - "paramName": "r", - "paramLongName": "rdfOutput", - "paramDescription": "the working path", - "paramRequired": true - }, - { - "paramName": "u", - "paramLongName": "sitemapUrl", - "paramDescription": "the sitemap url", - "paramRequired": true - }, - { - "paramName": "k", - "paramLongName": "sitemapURLKey", - "paramDescription": "the sitemap file contains a list of xml entries, each one has a tag identified with sitemapURLKey with the url as value", - "paramRequired": true - }, - { - "paramName": "d", - "paramLongName": "dynamic", - "paramDescription": "the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)", - "paramRequired": false - }, - { - "paramName": "m", - "paramLongName": "maxScrapedPages", - "paramDescription": "max number of pages that will be scraped, default: no limit", - "paramRequired": false - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml deleted file mode 100644 index c5d960eb1..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml +++ /dev/null @@ -1,22 +0,0 @@ - - - jobTracker - yarn - - - nameNode - hdfs://hadoop-rm1.garr-pa1.d4science.org:8020 - - - oozie.launcher.mapreduce.user.classpath.first - true - - - oozie.use.system.libpath - true - - - 
oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml deleted file mode 100644 index 636babf07..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml +++ /dev/null @@ -1,81 +0,0 @@ - - - - workingPath - /data/bioschema/mobidb/ - the working path - - - sitemapUrl - https://mobidb.org/sitemap2.xml.gz - - - sitemapURLKey - loc - - - dynamic - true - the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) - - - maxScrapedPages - 5 - max number of pages that will be scraped, default: no limit - - - rdfOutput - nquads.seq - rdf output of scraping step - - - scraping_java_opts - -Xmx4g -Dwebdriver.chrome.whitelistedIps= - Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb. - - - - - ${jobTracker} - ${nameNode} - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - - - oozie.launcher.mapreduce.user.classpath.first - true - - - eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob - ${scraping_java_opts} - --nameNode${nameNode} - --workingPath${workingPath} - --rdfOutput${rdfOutput} - --sitemapUrl${sitemapUrl} - --sitemapURLKey${sitemapURLKey} - --dynamic${dynamic} - --maxScrapedPages${maxScrapedPages} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties b/dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties deleted file mode 100644 index 26f94f2df..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties +++ /dev/null @@ -1,4 +0,0 @@ -maxLimitScrape=200000 -schemaContext=https\://schema.org/docs/jsonldcontext.jsonld -dynamic=true -chromiumDriverLocation=/bin/chromedriver \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties b/dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties deleted file mode 100644 index 63cba917e..000000000 --- a/dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties +++ /dev/null @@ -1,9 +0,0 @@ -# Set root logger level to DEBUG and its only appender to A1. -log4j.rootLogger=INFO, A1 - -# A1 is set to be a ConsoleAppender. -log4j.appender.A1=org.apache.log4j.ConsoleAppender - -# A1 uses PatternLayout. 
-log4j.appender.A1.layout=org.apache.log4j.PatternLayout -log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/Html2TriplesTest.java b/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/Html2TriplesTest.java deleted file mode 100644 index 233e50d6e..000000000 --- a/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/Html2TriplesTest.java +++ /dev/null @@ -1,45 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.bioschema; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; - -import org.apache.any23.Any23; -import org.apache.any23.extractor.ExtractionException; -import org.apache.any23.source.DocumentSource; -import org.apache.any23.source.StringDocumentSource; -import org.apache.any23.writer.NTriplesWriter; -import org.apache.any23.writer.TripleHandler; -import org.apache.any23.writer.TripleHandlerException; -import org.apache.commons.io.IOUtils; -import org.apache.commons.io.output.ByteArrayOutputStream; -import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class Html2TriplesTest { - - static Logger logger = LoggerFactory.getLogger(Html2TriplesTest.class); - - @Test -// @Disabled - void conversionTest() throws Exception { - InputStream is = Html2TriplesTest.class.getResourceAsStream("/eu/dnetlib/dhp/bmuse/bioschema/ped.html"); - String page = IOUtils.toString(is, StandardCharsets.UTF_8.name()); - DocumentSource source = new StringDocumentSource(page, "https://proteinensemble.org/PED00001"); - Any23 runner = new Any23(); - try (ByteArrayOutputStream out = new ByteArrayOutputStream(); - TripleHandler handler = new NTriplesWriter(out);) { - runner.extract(source, handler); - logger.info(out.toString("UTF-8")); - } catch (ExtractionException e) { - logger.error("Cannot extract triples", e); - } catch (IOException e1) { - logger.error(" IO error whilst extracting triples", e1); - } catch (TripleHandlerException e2) { - logger.error("TripleHanderException", e2); - } - - } -} diff --git a/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java b/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java deleted file mode 100644 index 2b87d069a..000000000 --- a/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java +++ /dev/null @@ -1,24 +0,0 @@ - -package eu.dnetlib.dhp.bmuse.bioschema; - -import org.jsoup.select.Elements; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.bmuse.utils.UrlParser; - -public class SitemapTest { - - static Logger logger = LoggerFactory.getLogger(SitemapTest.class); - - @Test - @Disabled - void sitemapGzTest() throws Exception { - Elements urls = UrlParser.getSitemapList("https://disprot.org/sitemap2.xml.gz", "loc"); - urls.forEach(url -> { - logger.info(url.text()); - }); - } -} diff --git a/dhp-workflows/dhp-bmuse/src/test/resources/eu/dnetlib/dhp/bmuse/bioschema/ped.html b/dhp-workflows/dhp-bmuse/src/test/resources/eu/dnetlib/dhp/bmuse/bioschema/ped.html deleted file mode 100644 index e9ca13caa..000000000 --- a/dhp-workflows/dhp-bmuse/src/test/resources/eu/dnetlib/dhp/bmuse/bioschema/ped.html +++ /dev/null @@ -1,37 +0,0 @@ - - - PED - - - - - - - - - - - - - - - - - - - - -

PED00001 - Structural ensemble of pSic1 (1-90) with phosphorylations at Thr5, Thr33, Thr45, Ser69, Ser76, Ser80

Experiments' raw data bmrb:16659 link
Publication
Structure/function implications in a dynamic complex of the intrinsically disordered Sic1 with the Cdc4 subunit of an SCF ubiquitin ligase. Mittag T, Marsh J, Grishaev A, Orlicky S, Lin H, Sicheri F, Tyers M, Forman-Kay JD. Structure, 2010. pubmed:20399186

NMR Experiments: All NMR data were collected on Varian Inova 500 MHz, 600 MHz and 800 MHz spectrometers at 5 °C. The NMR samples were prepared in PBS and spectra were processed and analysed using NMRPipe/NMR Draw. Assignments and relaxation experiments were reported previously. NMR data is deposited in the BMRB with accession codes 16657 and 16659 for Sic1 and pSic1, respectively. PRE experiments and NH R2 NMR experiments were performed. The paramagnetic contribution to the transverse relaxation rate (i.e. the paramagnetic relaxation enhancement, PRE) is the difference between transverse relaxation rates in paramagnetic and diamagnetic states. 1DHN RDCs were measured on 0.3 mM and 0.2 mM Sic1 and pSic1 samples, respectively. Couplings were extracted using “Fuda: A function and data fitting and analysis package”. Errors were calculated from at least duplicate data sets. SAXS data collection: Small angle x-ray scattering data were acquired at the Beam Line 12-IDC at the Advanced Light Source synchrotron (Argonne National Laboratory, Argonne, IL). A total of 20 sequential data frames with exposure times of 0.25 seconds were recorded. Samples and buffers were flowing during data collection to prevent radiation damage. Individual data frames were converted from 2D to 1D profiles and normalized by the corresponding incident beam intensities. The final 1D scattering profiles and their uncertainties were then calculated as means and standard deviations over the 20 frames and then buffer data were subtracted from the sample data.

Ensemble models of intrinsically disordered Sic1 and pSic1 were calculated using essentially the same approach as was described (Marsh and Forman-Kay, 2009). Distance restraints were calculated from PRE measurements. SAXS profiles of the experimentally restrained ensembles were calculated by predicting scattering curves for each individual member using the program CRYSOL (Svergun et al., 1995) and averaged over the members of the ensemble. Chemical shifts were calculated from individual conformers using SHIFTX (Neal et al., 2003). RDCs were calculated using a local alignment approach, in which local alignment tensors are calculated for 15 residue fragments of the sequence in a sliding window fashion (Marsh et al., 2008). 15N R2 relaxation rates were compared to the number of heavy atoms in an 8 Å radius of each measured nucleus, as previously described (Marsh and Forman-Kay, 2009).
The Sic1 and pSic1 ensemble models comprised residues 1-90 of the full-length Sic1 amino acid sequence plus an N-terminal Gly-Ser sequence remaining after tag cleavage. Glutamate residues were used to represent the phosphorylated residues in pSic1 to facilitate use of TraDES (Feldman and Hogue, 2000). These glutamate residues were converted to the proper phosphorylated threonine or serine residues for electrostatic calculations. Three independent ensembles were calculated for each of free Sic1 and pSic1 and the pSic1 complex. Calculations were performed on a cluster of CPUs, with one main node performing the core conformational selection calculations and 8-12 nodes performing the iterative conformational sampling with CNS (Brünger et al., 1998), Unfoldtraj, and TraDES (Feldman and Hogue, 2000). The initial temperature for the ENSEMBLE calculations was set to 10,000 and decreased to 0.01 in 200,000 steps. The starting ensembles contained 200 structures and the number of conformers comprising the ensembles was decreased by one after each successful ENSEMBLE calculation in which full agreement with experimental restraints was achieved. Calculations were stopped when a smaller ensemble could not be successfully calculated within 72 hr.

Cross reference disprot:DP00631 link
[Sequence feature viewer: ruler 10-90, UniProt P38634, sequence GSMTPSTPPRSRGTRYLAQPSGNTSSSALMQGQKTPQKPSQNLVPVTPSTTKSFKNAPLLAPPNSNMGMTSPFNGLTSPQRSPFPKSSVKRT; tracks: Chain A, PTMs, and secondary structure entropy / relative ASA for PED00001e001, PED00001e002 and PED00001e003]

Deposited ensemble

Ensemble ID PED00001e001 Number of models 11

Chain: A
Secondary structure entropy 0.40 Relative solvent accessibility 0.68 Radius of gyration 26.74

Ensemble ID PED00001e002 Number of models 10

Chain: A
Secondary structure entropy 0.39 Relative solvent accessibility 0.70 Radius of gyration 26.71

Ensemble ID PED00001e003 Number of models 11

Chain: A
Secondary structure entropy 0.44 Relative solvent accessibility 0.70 Radius of gyration 28.15
\ No newline at end of file
diff --git a/dhp-workflows/dhp-rdfconverter/pom.xml b/dhp-workflows/dhp-rdfconverter/pom.xml
index 77d48a1e9..a71ee8f33 100644
--- a/dhp-workflows/dhp-rdfconverter/pom.xml
+++ b/dhp-workflows/dhp-rdfconverter/pom.xml
@@ -4,7 +4,7 @@
 	<parent>
 		<groupId>eu.dnetlib.dhp</groupId>
 		<artifactId>dhp-workflows</artifactId>
-		<version>1.2.4-SNAPSHOT</version>
+		<version>1.2.5-SNAPSHOT</version>
 	</parent>
 
 	<artifactId>dhp-rdfconverter</artifactId>
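
Note for reviewers: the removed ScrapingJob wrote its output as a block-compressed Hadoop SequenceFile of Text pairs (page URL as key, scraped N-Quads as value) under workingPath + rdfOutput (e.g. /data/bioschema/mobidb/nquads.seq). The sketch below shows how a downstream step could read that file back; it is only an illustration, not part of this patch — the class name and command-line arguments are hypothetical, and whether dhp-rdfconverter consumes this exact file is an assumption.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Hypothetical reader for the nquads SequenceFile written by the removed ScrapingJob.
public class NquadsSeqReader {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", args[0]); // nameNode URI, e.g. the value passed to the Oozie workflow (assumption)

		Path input = new Path(args[1]); // e.g. /data/bioschema/mobidb/nquads.seq (assumption)

		// The writer used Text keys/values with BLOCK Gzip compression; SequenceFile.Reader
		// detects the codec from the file header, so no codec configuration is needed here.
		try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(input))) {
			Text key = new Text();   // page URL
			Text value = new Text(); // N-Quads scraped from that page
			while (reader.next(key, value)) {
				System.out.println(key + " -> " + value.getLength() + " bytes of N-Quads");
			}
		}
	}
}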