diff --git a/dhp-workflows/dhp-bmuse/pom.xml b/dhp-workflows/dhp-bmuse/pom.xml new file mode 100644 index 000000000..b34a0934b --- /dev/null +++ b/dhp-workflows/dhp-bmuse/pom.xml @@ -0,0 +1,42 @@ + + + 4.0.0 + + eu.dnetlib.dhp + dhp-workflows + 1.2.4-SNAPSHOT + + dhp-bmuse + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + hwu.elixir + bmuse-core + 0.5.3 + + + com.google.guava + guava + 22.0 + + + com.squareup.okhttp3 + okhttp + 3.11.0 + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java new file mode 100644 index 000000000..28ad1ee0a --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java @@ -0,0 +1,112 @@ + +package eu.dnetlib.dhp.bmuse.bioschema; + +import java.nio.charset.Charset; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Stream; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.util.LongAccumulator; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper; +import eu.dnetlib.dhp.bmuse.utils.UrlParser; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class ScrapingJob { + + static Logger logger = LoggerFactory.getLogger(ScrapingJob.class); + 
private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z"); + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + ScrapingJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json"))); + parser.parseArgument(args); + + final String nameNode = parser.get("nameNode"); + final String workingPath = parser.get("workingPath"); + final String rdfOutput = parser.get("rdfOutput"); + final String sitemapUrl = parser.get("sitemapUrl"); + final String sitemapURLKey = parser.get("sitemapURLKey"); + final String dynamic = parser.get("dynamic"); + final String maxScrapedPages = parser.get("maxScrapedPages"); + Boolean dynamicValue = true; + if (Objects.nonNull(dynamic)) { + dynamicValue = Boolean.parseBoolean(dynamic); + } + final boolean scrapingType = dynamicValue.booleanValue(); + + AtomicLong scraped = new AtomicLong(0l); + AtomicLong errors = new AtomicLong(0l); + + logger + .info( + "*************************** STARTING SCRAPE: " + + formatter.format(new Date(System.currentTimeMillis()))); + logger.info("Default charset: " + Charset.defaultCharset()); + + BMUSEScraper scraper = new BMUSEScraper(); + String url = sitemapUrl.toLowerCase(); + Elements urls = UrlParser.getSitemapList(url, sitemapURLKey); + long total = urls.size(); + + Path output = new Path( + nameNode + .concat(workingPath) + .concat(rdfOutput)); + Configuration conf = DHPUtils.getHadoopConfiguration(nameNode); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(output), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class), + SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) { + Stream urlStream = null; + if (Objects.nonNull(maxScrapedPages)) { + urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages)); + } 
else { + urlStream = urls.stream(); + } + urlStream.forEach(u -> { + try { + final Text key = new Text(u.text()); + final Text value = new Text(scraper.scrapeUrl(u.text(), scrapingType)); + writer.append(key, value); + scraped.getAndIncrement(); + } catch (Exception e) { + logger.error(u.text(), e); + errors.getAndIncrement(); + } + }); + } + + logger + .info( + "*************************** ENDING SCRAPE: " + formatter.format(new Date(System.currentTimeMillis()))); + logger + .info( + "Total pages to scrape: " + total + " Scraped: " + scraped.get() + + " Errors: " + errors.get()); + } +} diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java new file mode 100644 index 000000000..1e58503fa --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java @@ -0,0 +1,85 @@ + +package eu.dnetlib.dhp.bmuse.utils; + +import java.io.StringWriter; + +import org.apache.any23.source.DocumentSource; +import org.apache.any23.source.StringDocumentSource; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Model; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; +import org.eclipse.rdf4j.rio.RDFFormat; +import org.eclipse.rdf4j.rio.Rio; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import hwu.elixir.scrape.exceptions.*; +import hwu.elixir.scrape.scraper.ScraperFilteredCore; + +public class BMUSEScraper extends ScraperFilteredCore { + + private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName()); + + public String scrapeUrl(String url, Boolean dynamic) + throws MissingMarkupException, FourZeroFourException { + url = fixURL(url); + + String html = ""; + // The dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information + // (dynamic and static respectively) + + if (dynamic) { + html = 
wrapHTMLExtraction(url); + } else { + html = wrapHTMLExtractionStatic(url); + } + + if (html == null || html.contentEquals("")) + return new String("empty html"); + if (logger.isTraceEnabled()) { + logger.trace("Read following html =============================================================="); + logger.trace(html); + } + + try { + html = injectId(html, url); + if (logger.isTraceEnabled()) { + logger + .trace( + "Same HTML after injecting ID =============================================================="); + logger.trace(html); + } + } catch (MissingHTMLException | JsonLDInspectionException e) { + logger.error(e.toString()); + return e.getMessage(); + } + + DocumentSource source = new StringDocumentSource(html, url); + IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI()); + + String n3 = getTriplesInNTriples(source); + if (n3 == null) + throw new MissingMarkupException(url); + + Model updatedModel = null; + try { + updatedModel = processTriples(n3, sourceIRI, 0l); + } catch (NTriplesParsingException e1) { + logger + .error( + "Failed to process triples into model; the NTriples generated from the URL (" + url + + ") could not be parsed into a model."); + return e1.getMessage(); + } + if (updatedModel == null) + return new String("rdf model null"); + + try (StringWriter jsonLDWriter = new StringWriter()) { + Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS); + return jsonLDWriter.toString(); + } catch (Exception e) { + logger.error("Problem writing jsonld for " + url, e); + return e.getMessage(); + } + } +} diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java new file mode 100644 index 000000000..c1c626b69 --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java @@ -0,0 +1,65 @@ + +package eu.dnetlib.dhp.bmuse.utils; + +import java.io.IOException; + +import 
org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import hwu.elixir.utils.Helpers; + +public class UrlParser { + + private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName()); + + public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException { + + Document doc = new Document(url); + Document urlSitemapListsNested; + Elements elements = new Elements(); + Elements sitemaps = new Elements(); + boolean sitemapindex = false; + boolean urlset = false; + + try { + int urlLength = url.length(); + logger.info("parse sitemap list"); + String sitemapExt = url.substring(urlLength - 3, urlLength); + if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending + logger.info("compressed sitemap"); + byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes(); + doc = Helpers.gzipFileDecompression(bytes); + } else { + doc = Jsoup.connect(url).maxBodySize(0).get(); + } + + } catch (IOException e) { + logger.error("Jsoup parsing exception: " + e.getMessage()); + } + + try { + + elements = doc.select(sitemapURLKey); + + // check the html if it is a sitemapindex or a urlset + sitemapindex = doc.outerHtml().contains("sitemapindex"); + urlset = doc.outerHtml().contains("urlset"); + } catch (NullPointerException e) { + logger.error(e.getMessage()); + } + + if (sitemapindex) { + // if sitemapindex get the loc of all the sitemaps + // added warning for sitemap index files + logger + .warn( + "please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead"); + sitemaps = doc.select(sitemapURLKey); + } + + return elements; + } +} diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json 
b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json new file mode 100644 index 000000000..1ac0b50de --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json @@ -0,0 +1,44 @@ +[ + { + "paramName": "n", + "paramLongName": "nameNode", + "paramDescription": "the Name Node URI", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working path", + "paramRequired": true + }, + { + "paramName": "r", + "paramLongName": "rdfOutput", + "paramDescription": "the working path", + "paramRequired": true + }, + { + "paramName": "u", + "paramLongName": "sitemapUrl", + "paramDescription": "the sitemap url", + "paramRequired": true + }, + { + "paramName": "k", + "paramLongName": "sitemapURLKey", + "paramDescription": "the sitemap file contains a list of xml entries, each one has a tag identified with sitemapURLKey with the url as value", + "paramRequired": true + }, + { + "paramName": "d", + "paramLongName": "dynamic", + "paramDescription": "the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)", + "paramRequired": false + }, + { + "paramName": "m", + "paramLongName": "maxScrapedPages", + "paramDescription": "max number of pages that will be scraped, default: no limit", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml new file mode 100644 index 000000000..7b13aab55 --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml @@ -0,0 +1,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + jobTracker + yarn + + + nameNode + 
hdfs://hadoop-rm1.garr-pa1.d4science.org:8020 + + + hive_metastore_uris + thrift://hadoop-edge3.garr-pa1.d4science.org:9083 + + + spark2YarnHistoryServerAddress + http://hadoop-rm2.garr-pa1.d4science.org:19888 + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + "com.cloudera.spark.lineage.NavigatorAppListener" + + + spark2SqlQueryExecutionListeners + "com.cloudera.spark.lineage.NavigatorQueryListener" + + \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml new file mode 100644 index 000000000..705396653 --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml @@ -0,0 +1,98 @@ + + + + workingPath + /data/bioschema/disprot/ + the working path + + + rdfOutput + nquads.seq + rdf output of scraping step + + + sitemapUrl + https://disprot.org/sitemap2.xml.gz + + + sitemapURLKey + loc + + + dynamic + true + the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively) + + + maxScrapedPages + 100 + max number of pages that will be scraped, default: no limit + + + oozie.launcher.mapreduce.map.java.opts + -Xmx4g + + + spark2RdfConversionMaxExecutors + 50 + + + sparkDriverMemory + 7G + memory for driver process + + + sparkExecutorMemory + 2G + memory for individual executor + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + + + 
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + ${nameNode} + eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob + --nameNode${nameNode} + --workingPath${workingPath} + --rdfOutput${rdfOutput} + --sitemapUrl${sitemapUrl} + --sitemapURLKey${sitemapURLKey} + --dynamic${dynamic} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties b/dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties new file mode 100644 index 000000000..26f94f2df --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties @@ -0,0 +1,4 @@ +maxLimitScrape=200000 +schemaContext=https\://schema.org/docs/jsonldcontext.jsonld +dynamic=true +chromiumDriverLocation=/bin/chromedriver \ No newline at end of file diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties b/dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties new file mode 100644 index 000000000..63cba917e --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties @@ -0,0 +1,9 @@ +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=INFO, A1 + +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. 
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java b/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java new file mode 100644 index 000000000..2b87d069a --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java @@ -0,0 +1,24 @@ + +package eu.dnetlib.dhp.bmuse.bioschema; + +import org.jsoup.select.Elements; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.bmuse.utils.UrlParser; + +public class SitemapTest { + + static Logger logger = LoggerFactory.getLogger(SitemapTest.class); + + @Test + @Disabled + void sitemapGzTest() throws Exception { + Elements urls = UrlParser.getSitemapList("https://disprot.org/sitemap2.xml.gz", "loc"); + urls.forEach(url -> { + logger.info(url.text()); + }); + } +} diff --git a/dhp-workflows/dhp-rdfconverter/pom.xml b/dhp-workflows/dhp-rdfconverter/pom.xml new file mode 100644 index 000000000..77d48a1e9 --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/pom.xml @@ -0,0 +1,42 @@ + + + 4.0.0 + + eu.dnetlib.dhp + dhp-workflows + 1.2.4-SNAPSHOT + + dhp-rdfconverter + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + org.apache.any23 + apache-any23-core + 2.3 + + + org.eclipse.rdf4j + rdf4j-rio-rdfxml + 2.5.4 + + + org.eclipse.rdf4j + rdf4j-model + 2.5.4 + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/SparkRdfToDatacite.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/SparkRdfToDatacite.java new file mode 100644 index 000000000..057492dab --- /dev/null +++ 
b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/SparkRdfToDatacite.java @@ -0,0 +1,71 @@ + +package eu.dnetlib.dhp.rdfconverter.bioschema; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Objects; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.rdfconverter.utils.RDFConverter; + +public class SparkRdfToDatacite { + + static Logger logger = LoggerFactory.getLogger(SparkRdfToDatacite.class); + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkRdfToDatacite.class + .getResourceAsStream( + "/eu/dnetlib/dhp/rdfconverter/bioschema/generate_dataset.json"))); + parser.parseArgument(args); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + final String workingPath = parser.get("workingPath"); + final String rdfNquadsRecords = parser.get("rdfInput"); + final String output = parser.get("output"); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + String rdfNquadsRecordsPath = workingPath.concat(rdfNquadsRecords); + JavaPairRDD rdfNquadsRecordsRDD = sc + .sequenceFile(rdfNquadsRecordsPath, Text.class, Text.class); + logger.info("Rdf nquads records retrieved: {}", 
rdfNquadsRecordsRDD.count()); + + JavaRDD proteins = rdfNquadsRecordsRDD.flatMap(nquads -> { + RDFConverter converter = new RDFConverter(); + ArrayList jsonlds = null; + try { + jsonlds = converter.nQuadsFile2DataciteJson(nquads._2().toString()); + } catch (Exception e) { + logger.error(nquads._1().toString(), e); + return Arrays.asList(new String()).iterator(); + } + return jsonlds.iterator(); + }).filter(Objects::nonNull).filter(jsonld -> !jsonld.isEmpty()).map(jsonld -> new Text(jsonld)); + logger.info("json datacite generated: {}", proteins.count()); + proteins.saveAsTextFile(workingPath.concat(output), GzipCodec.class); + }); + } +} diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java new file mode 100644 index 000000000..ffac36459 --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java @@ -0,0 +1,384 @@ + +package eu.dnetlib.dhp.rdfconverter.bioschema.model; + +import java.util.List; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; + +@JsonIgnoreProperties(ignoreUnknown = true) +public class BioSchemaProtein { + @JsonProperty("@id") + private String id; + @JsonProperty("@graph") + private List entryList; + @JsonProperty("http://purl.org/pav/retrievedOn") + private DateTimeType retrievedOn; + @JsonProperty("citation") + private Citation citation; + + public static class Entry { + @JsonProperty("@id") + private String id; + @JsonProperty("@type") + private List type; + @JsonProperty("https://schema.org/identifier") + private String identifier; + @JsonProperty("https://schema.org/name") + private String name; + @JsonProperty("associatedDisease") + private List associatedDisease; + @JsonProperty("description") + private String description; 
+ @JsonProperty("isEncodedByBioChemEntity") + private String isEncodedByBioChemEntity; + @JsonProperty("url") + private String url; + @JsonProperty("alternateName") + private String alternateName; + @JsonProperty("bioChemInteraction") + private List bioChemInteraction; + @JsonProperty("bioChemSimilarity") + private List bioChemSimilarity; + @JsonProperty("hasMolecularFunction") + private String hasMolecularFunction; + @JsonProperty("image") + private String image; + @JsonProperty("isInvolvedInBiologicalProcess") + private String isInvolvedInBiologicalProcess; + @JsonProperty("isPartOfBioChemEntity") + private IsPartOfBioChemEntity isPartOfBioChemEntity; + @JsonProperty("mainEntityOfPage") + private Link mainEntityOfPage; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getType() { + return type; + } + + public void setType(List type) { + this.type = type; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + @JsonProperty("https://schema.org/sameAs") + private List sameAs; + + public List getSameAs() { + return sameAs; + } + + public void setSameAs(List sameAs) { + this.sameAs = sameAs; + } + + public String getIdentifier() { + return identifier; + } + + public void setIdentifier(String identifier) { + this.identifier = identifier; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public String getIsEncodedByBioChemEntity() { + return isEncodedByBioChemEntity; + } + + public void setIsEncodedByBioChemEntity(String isEncodedByBioChemEntity) { + this.isEncodedByBioChemEntity = isEncodedByBioChemEntity; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getAlternateName() { + return alternateName; + } + + public void setAlternateName(String 
alternateName) { + this.alternateName = alternateName; + } + + public List getBioChemInteraction() { + return bioChemInteraction; + } + + public void setBioChemInteraction(List bioChemInteraction) { + this.bioChemInteraction = bioChemInteraction; + } + + public List getBioChemSimilarity() { + return bioChemSimilarity; + } + + public void setBioChemSimilarity(List bioChemSimilarity) { + this.bioChemSimilarity = bioChemSimilarity; + } + + public String getHasMolecularFunction() { + return hasMolecularFunction; + } + + public void setHasMolecularFunction(String hasMolecularFunction) { + this.hasMolecularFunction = hasMolecularFunction; + } + + public String getImage() { + return image; + } + + public void setImage(String image) { + this.image = image; + } + + public String getIsInvolvedInBiologicalProcess() { + return isInvolvedInBiologicalProcess; + } + + public void setIsInvolvedInBiologicalProcess(String isInvolvedInBiologicalProcess) { + this.isInvolvedInBiologicalProcess = isInvolvedInBiologicalProcess; + } + + public List getAssociatedDisease() { + return associatedDisease; + } + + public void setAssociatedDisease(List associatedDisease) { + this.associatedDisease = associatedDisease; + } + + public IsPartOfBioChemEntity getIsPartOfBioChemEntity() { + return isPartOfBioChemEntity; + } + + public void setIsPartOfBioChemEntity(IsPartOfBioChemEntity isPartOfBioChemEntity) { + this.isPartOfBioChemEntity = isPartOfBioChemEntity; + } + + public Link getMainEntityOfPage() { + return mainEntityOfPage; + } + + public void setMainEntityOfPage(Link mainEntityOfPage) { + this.mainEntityOfPage = mainEntityOfPage; + } + + } + + public static class IsPartOfBioChemEntity { + @JsonProperty("@type") + private String type; + @JsonProperty("url") + private String url; + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } 
+ } + + public static class AssociatedDisease { + @JsonProperty("@type") + private String type; + @JsonProperty("name") + private String name; + @JsonProperty("code") + private DeseaseCode code; + @JsonProperty("id") + private String id; + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public DeseaseCode getCode() { + return code; + } + + public void setCode(DeseaseCode code) { + this.code = code; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + } + + public static class DeseaseCode { + @JsonProperty("@type") + private String type; + @JsonProperty("codeValue") + private String codeValue; + @JsonProperty("codingSystem") + private String codingSystem; + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getCodeValue() { + return codeValue; + } + + public void setCodeValue(String codeValue) { + this.codeValue = codeValue; + } + + public String getCodingSystem() { + return codingSystem; + } + + public void setCodingSystem(String codingSystem) { + this.codingSystem = codingSystem; + } + } + + public static class Link { + @JsonProperty("@id") + private String id; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + } + + public static class DateTimeType { + @JsonProperty("@type") + private String type; + @JsonProperty("@value") + private String value; + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + } + + public static class Citation { + @JsonProperty("@type") + private String type; + @JsonProperty("@id") + private String id; + + public 
String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getEntryList() { + return entryList; + } + + public void setEntryList(List entryList) { + this.entryList = entryList; + } + + public DateTimeType getRetrievedOn() { + return retrievedOn; + } + + public void setRetrievedOn(DateTimeType retrievedOn) { + this.retrievedOn = retrievedOn; + } + + public Citation getCitation() { + return citation; + } + + public void setCitation(Citation citation) { + this.citation = citation; + } +} diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java new file mode 100644 index 000000000..6fe0963e2 --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java @@ -0,0 +1,291 @@ + +package eu.dnetlib.dhp.rdfconverter.bioschema.model; + +import java.util.ArrayList; +import java.util.List; + +import com.fasterxml.jackson.annotation.JsonInclude; + +@JsonInclude(JsonInclude.Include.NON_NULL) +public class DataciteProtein { + private String id; + private String doi; + private Types types; + List creators = new ArrayList(); + private String publisher; + private String publicationYear; + private static final String schemaVersion = "http://datacite.org/schema/kernel-4"; + List identifiers = new ArrayList(); + List relatedIdentifiers = new ArrayList(); + List alternateIdentifiers = new ArrayList(); + List descriptions = new ArrayList(); + List titles = new ArrayList<Title>(); + private List<DataciteDate> dates = new ArrayList<DataciteDate>(); + + @JsonInclude(JsonInclude.Include.NON_NULL) 
+ public static class Types { + private String resourceType; + private String resourceTypeGeneral; + + public String getResourceType() { + return resourceType; + } + + public void setResourceType(String resourceType) { + this.resourceType = resourceType; + } + + public String getResourceTypeGeneral() { + return resourceTypeGeneral; + } + + public void setResourceTypeGeneral(String resourceTypeGeneral) { + this.resourceTypeGeneral = resourceTypeGeneral; + } + } + + @JsonInclude(JsonInclude.Include.NON_NULL) + public static class Creators { + } + + @JsonInclude(JsonInclude.Include.NON_NULL) + public static class Identifier { + private String identifier; + private String identifierType; + + public String getIdentifier() { + return identifier; + } + + public void setIdentifier(String identifier) { + this.identifier = identifier; + } + + public String getIdentifierType() { + return identifierType; + } + + public void setIdentifierType(String identifierType) { + this.identifierType = identifierType; + } + } + + @JsonInclude(JsonInclude.Include.NON_NULL) + public static class RelatedIdentifier { + private String relationType; + private String relatedIdentifier; + private String relatedIdentifierType; + + public String getRelationType() { + return relationType; + } + + public void setRelationType(String relationType) { + this.relationType = relationType; + } + + public String getRelatedIdentifier() { + return relatedIdentifier; + } + + public void setRelatedIdentifier(String relatedIdentifier) { + this.relatedIdentifier = relatedIdentifier; + } + + public String getRelatedIdentifierType() { + return relatedIdentifierType; + } + + public void setRelatedIdentifierType(String relatedIdentifierType) { + this.relatedIdentifierType = relatedIdentifierType; + } + } + + @JsonInclude(JsonInclude.Include.NON_NULL) + public static class AlternateIdentifier { + private String alternateIdentifier; + private String alternateIdentifierType; + + public String getAlternateIdentifier() { + 
return alternateIdentifier; + } + + public void setAlternateIdentifier(String alternateIdentifier) { + this.alternateIdentifier = alternateIdentifier; + } + + public String getAlternateIdentifierType() { + return alternateIdentifierType; + } + + public void setAlternateIdentifierType(String alternateIdentifierType) { + this.alternateIdentifierType = alternateIdentifierType; + } + } + + @JsonInclude(JsonInclude.Include.NON_NULL) + public static class Description { + private String description; + private String descriptionType; + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public String getDescriptionType() { + return descriptionType; + } + + public void setDescriptionType(String descriptionType) { + this.descriptionType = descriptionType; + } + } + + @JsonInclude(JsonInclude.Include.NON_NULL) + public static class Title { + private String title; + private String titleType; + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getTitleType() { + return titleType; + } + + public void setTitleType(String titleType) { + this.titleType = titleType; + } + } + + @JsonInclude(JsonInclude.Include.NON_NULL) + public static class DataciteDate { + private String date; + private String dateType; + + public String getDate() { + return date; + } + + public void setDate(String date) { + this.date = date; + } + + public String getDateType() { + return dateType; + } + + public void setDateType(String dateType) { + this.dateType = dateType; + } + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public String getDoi() { + return doi; + } + + public void setDoi(String doi) { + this.doi = doi; + } + + public Types getTypes() { + return types; + } + + public void setTypes(Types types) { + this.types = types; + } + + public List<Creators> 
getCreators() { + return creators; + } + + public void setCreators(List<Creators> creators) { + this.creators = creators; + } + + public String getPublisher() { + return publisher; + } + + public void setPublisher(String publisher) { + this.publisher = publisher; + } + + public String getPublicationYear() { + return publicationYear; + } + + public void setPublicationYear(String publicationYear) { + this.publicationYear = publicationYear; + } + + public static String getSchemaVersion() { + return schemaVersion; + } + + public List<RelatedIdentifier> getRelatedIdentifiers() { + return relatedIdentifiers; + } + + public void setRelatedIdentifiers(List<RelatedIdentifier> relatedIdentifiers) { + this.relatedIdentifiers = relatedIdentifiers; + } + + public List<AlternateIdentifier> getAlternateIdentifiers() { + return alternateIdentifiers; + } + + public void setAlternateIdentifiers(List<AlternateIdentifier> alternateIdentifiers) { + this.alternateIdentifiers = alternateIdentifiers; + } + + public List<Description> getDescriptions() { + return descriptions; + } + + public void setDescriptions(List<Description> descriptions) { + this.descriptions = descriptions; + } + + public List<Title> getTitles() { + return titles; + } + + public void setTitles(List<Title> titles) { + this.titles = titles; + } + + public List<Identifier> getIdentifiers() { + return identifiers; + } + + public void setIdentifiers(List<Identifier> identifiers) { + this.identifiers = identifiers; + } + + public List<DataciteDate> getDates() { + return dates; + } + + public void setDates(List<DataciteDate> dates) { + this.dates = dates; + } +} diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java new file mode 100644 index 000000000..08e4a03ad --- /dev/null +++ 
b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java @@ -0,0 +1,197 @@ + +package eu.dnetlib.dhp.rdfconverter.utils; + +import java.io.StringReader; +import java.io.StringWriter; +import java.util.*; + +import org.eclipse.rdf4j.model.Model; +import org.eclipse.rdf4j.rio.RDFFormat; +import org.eclipse.rdf4j.rio.RDFWriter; +import org.eclipse.rdf4j.rio.Rio; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.github.jsonldjava.core.JsonLdOptions; +import com.github.jsonldjava.core.JsonLdProcessor; +import com.github.jsonldjava.utils.JsonUtils; + +import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein; +import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein; + +public class RDFConverter { + + private static final Logger log = LoggerFactory.getLogger(RDFConverter.class); + + public ArrayList<String> nQuadsFile2DataciteJson(String nquads) throws Exception { + StringReader reader = new StringReader(nquads); + Model model = Rio.parse(reader, "", RDFFormat.NQUADS); + StringWriter jsonLDWriter = new StringWriter(); + RDFWriter rdfRecordWriter = Rio.createWriter(RDFFormat.JSONLD, jsonLDWriter); + Rio.write(model, rdfRecordWriter); + String jsonLDBuffer = jsonLDWriter.toString(); + Object jsonObject = JsonUtils.fromString(jsonLDBuffer); + Object compact = JsonLdProcessor.compact(jsonObject, new HashMap<>(), new JsonLdOptions()); + String compactContent = JsonUtils.toString(compact); + + ObjectMapper objectMapper = new ObjectMapper(); + objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY); + objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT); + objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class); + 
log.debug("BioSchema id: " + bioSchemaProtein.getId()); + BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn(); + BioSchemaProtein.Citation citation = bioSchemaProtein.getCitation(); + + ArrayList<String> results = new ArrayList<String>(); + bioSchemaProtein.getEntryList().stream().forEach(entry -> { + + if (entry.getType() != null + && entry.getType().stream().filter(type -> type.equals("https://schema.org/Protein")).count() == 1) { + + DataciteProtein dataciteProtein = new DataciteProtein(); + if (citation != null) { + addRelatedIdentifier(dataciteProtein, citation.getId(), "CitedBy"); + } + + DataciteProtein.Types types = new DataciteProtein.Types(); + types.setResourceType("Protein"); + types.setResourceTypeGeneral("Dataset"); + dataciteProtein.setTypes(types); + + DataciteProtein.DataciteDate dataciteDate = new DataciteProtein.DataciteDate(); + dataciteDate.setDate(retrievedOnType.getValue()); + dataciteDate.setDateType("Collected"); + dataciteProtein.getDates().add(dataciteDate); + + if (entry.getName() != null) { + log.debug("Name: " + entry.getName()); + DataciteProtein.Title title = new DataciteProtein.Title(); + title.setTitle(entry.getName()); + dataciteProtein.getTitles().add(title); + } + DataciteProtein.Identifier identifier = new DataciteProtein.Identifier(); + log.debug("Id: " + entry.getId()); + identifier.setIdentifier(entry.getId()); + identifier.setIdentifierType("URL"); + dataciteProtein.getIdentifiers().add(identifier); + + if (entry.getIdentifier() != null) { + log.debug("Identifier: " + entry.getIdentifier()); + addAlternateIdentifier(dataciteProtein, entry.getIdentifier()); + } + + if (entry.getDescription() != null) { + log.debug("description: " + entry.getDescription()); + DataciteProtein.Description description = new DataciteProtein.Description(); + description.setDescription(entry.getDescription()); + dataciteProtein.getDescriptions().add(description); + } + + if (entry.getIsEncodedByBioChemEntity() != null) 
{ + log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity()); + addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), ""); + } + + if (entry.getUrl() != null) { + log.debug("url: " + entry.getUrl()); + addAlternateIdentifier(dataciteProtein, entry.getUrl()); + } + + if (entry.getAlternateName() != null) { + log.debug("alternateName: " + entry.getAlternateName()); + DataciteProtein.Title title = new DataciteProtein.Title(); + title.setTitle(entry.getAlternateName()); + title.setTitleType("AlternativeTitle"); + dataciteProtein.getTitles().add(title); + } + + if (entry.getBioChemInteraction() != null) { + entry.getBioChemInteraction().stream().filter(Objects::nonNull).forEach(bc -> { + log.debug("bioChemInteraction: " + bc.getId()); + addRelatedIdentifier(dataciteProtein, bc.getId(), ""); + }); + } + + if (entry.getBioChemSimilarity() != null) { + entry.getBioChemSimilarity().stream().filter(Objects::nonNull).forEach(bc -> { + log.debug("bioChemSimilarity: " + bc.getId()); + addRelatedIdentifier(dataciteProtein, bc.getId(), ""); + }); + } + + if (entry.getHasMolecularFunction() != null) { + log.debug("hasMolecularFunction: " + entry.getHasMolecularFunction()); + addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), ""); + } + + if (entry.getIsInvolvedInBiologicalProcess() != null) { + log.debug("isInvolvedInBiologicalProcess: " + entry.getIsInvolvedInBiologicalProcess()); + addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), ""); + } + + if (entry.getIsEncodedByBioChemEntity() != null) { + log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity()); + addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), ""); + } + + if (entry.getIsPartOfBioChemEntity() != null) { + log.debug("isPartOfBioChemEntity: " + entry.getIsPartOfBioChemEntity()); + addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), ""); + } + + if 
(entry.getSameAs() != null) { + entry.getSameAs().stream().filter(Objects::nonNull).forEach(sameAs -> { + log.debug("sameAs: " + sameAs.getId()); + addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo"); + }); + } + + if (entry.getAssociatedDisease() != null) { + entry.getAssociatedDisease().stream().filter(Objects::nonNull).forEach(ad -> { + log.debug("associated disease: " + ad.getName()); + addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo"); + }); + } + + String proteinId = ""; + try { + String[] identifierParts = dataciteProtein.getIdentifiers().get(0).getIdentifier().split("/"); + proteinId = identifierParts[identifierParts.length - 1]; + } catch (Exception e) { + log.error("Identifier not found: {}", e.getMessage()); + } + + dataciteProtein.setId(proteinId); + + ObjectMapper mapper = new ObjectMapper(); + try { + StringWriter writer = new StringWriter(); + mapper.writeValue(writer, dataciteProtein); + results.add(writer.toString()); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + }); + return results; + } + + private void addRelatedIdentifier(DataciteProtein dataciteProtein, String relatedIdentifierValue, + String relationType) { + DataciteProtein.RelatedIdentifier relatedIdentifier = new DataciteProtein.RelatedIdentifier(); + relatedIdentifier.setRelatedIdentifier(relatedIdentifierValue); + if (!relationType.isEmpty()) { + relatedIdentifier.setRelationType(relationType); + } + dataciteProtein.getRelatedIdentifiers().add(relatedIdentifier); + } + + private void addAlternateIdentifier(DataciteProtein dataciteProtein, String alternateIdentifierValue) { + DataciteProtein.AlternateIdentifier alternateIdentifier = new DataciteProtein.AlternateIdentifier(); + alternateIdentifier.setAlternateIdentifier(alternateIdentifierValue); + dataciteProtein.getAlternateIdentifiers().add(alternateIdentifier); + } +} diff --git 
a/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/generate_dataset.json b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/generate_dataset.json new file mode 100644 index 000000000..4038c275f --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/generate_dataset.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "n", + "paramLongName": "nameNode", + "paramDescription": "the Name Node URI", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working path", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "rdfInput", + "paramDescription": "sequence file inside working path that contains rdf records", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "output", + "paramDescription": "relative path inside workingpath where bioschema dataset in datacite format will be stored", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/config-default.xml b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/config-default.xml new file mode 100644 index 000000000..7b13aab55 --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/config-default.xml @@ -0,0 +1,68 @@ +<configuration> + + <!-- OCEAN --> + +<!-- <property>--> +<!-- <name>jobTracker</name>--> +<!-- <value>yarnRM</value>--> +<!-- </property>--> +<!-- <property>--> +<!-- <name>nameNode</name>--> +<!-- <value>hdfs://nameservice1</value>--> +<!-- </property>--> +<!-- <property>--> +<!-- <name>hive_metastore_uris</name>--> +<!-- <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>--> +<!-- </property>--> +<!-- <property>--> +<!-- 
<name>spark2YarnHistoryServerAddress</name>--> +<!-- <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>--> +<!-- </property>--> + + + <!-- GARR --> + + <property> + <name>jobTracker</name> + <value>yarn</value> + </property> + <property> + <name>nameNode</name> + <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value> + </property> + <property> + <name>hive_metastore_uris</name> + <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value> + </property> + <property> + <name>spark2YarnHistoryServerAddress</name> + <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value> + </property> + + + <property> + <name>oozie.launcher.mapreduce.user.classpath.first</name> + <value>true</value> + </property> + + <property> + <name>oozie.use.system.libpath</name> + <value>true</value> + </property> + <property> + <name>oozie.action.sharelib.for.spark</name> + <value>spark2</value> + </property> + <property> + <name>spark2EventLogDir</name> + <value>/user/spark/spark2ApplicationHistory</value> + </property> + <property> + <name>spark2ExtraListeners</name> + <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value> + </property> + <property> + <name>spark2SqlQueryExecutionListeners</name> + <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value> + </property> +</configuration> \ No newline at end of file diff --git a/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/workflow.xml b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/workflow.xml new file mode 100644 index 000000000..0a7a2495f --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/workflow.xml @@ -0,0 +1,94 @@ +<workflow-app name="RdfConverter" xmlns="uri:oozie:workflow:0.5"> + <parameters> + <property> + <name>workingPath</name> + <value>/data/bioschema/disprot/</value> + <description>the working path</description> + 
</property> + <property> + <name>rdfInput</name> + <value>nquads.seq</value> + <description>rdf output of scraping workflow</description> + </property> + <property> + <name>output</name> + <value>json-datacite/</value> + </property> + <property> + <name>oozie.launcher.mapreduce.map.java.opts</name> + <value>-Xmx4g</value> + </property> + <property> + <name>spark2RdfConversionMaxExecutors</name> + <value>50</value> + </property> + <property> + <name>sparkDriverMemory</name> + <value>7G</value> + <description>memory for driver process</description> + </property> + <property> + <name>sparkExecutorMemory</name> + <value>2G</value> + <description>memory for individual executor</description> + </property> + <property> + <name>spark2ExtraListeners</name> + <value>com.cloudera.spark.lineage.NavigatorAppListener</value> + <description>spark 2.* extra listeners classname</description> + </property> + <property> + <name>spark2YarnHistoryServerAddress</name> + <description>spark 2.* yarn history server address</description> + </property> + <property> + <name>spark2EventLogDir</name> + <description>spark 2.* event log dir location</description> + </property> + </parameters> + + <global> + <job-tracker>${jobTracker}</job-tracker> + <name-node>${nameNode}</name-node> + </global> + + <start to="ResetWorkingPath"/> + <kill name="Kill"> + <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> + </kill> + + <action name="ResetWorkingPath"> + <fs> + <delete path='${workingPath}${output}'/> + </fs> + <ok to="NquadsToDataciteJson"/> + <error to="Kill"/> + </action> + + <action name="NquadsToDataciteJson"> + <spark xmlns="uri:oozie:spark-action:0.2"> + <master>yarn-cluster</master> + <mode>cluster</mode> + <name>NquadsToDataciteJson</name> + <class>eu.dnetlib.dhp.rdfconverter.bioschema.SparkRdfToDatacite</class> + <jar>dhp-rdfconverter-${projectVersion}.jar</jar> + <spark-opts> + --conf spark.dynamicAllocation.enabled=true + --conf 
spark.dynamicAllocation.maxExecutors=${spark2RdfConversionMaxExecutors} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + </spark-opts> + <arg>--nameNode</arg><arg>${nameNode}</arg> + <arg>--workingPath</arg><arg>${workingPath}</arg> + <arg>--rdfInput</arg><arg>${rdfInput}</arg> + <arg>--output</arg><arg>${output}</arg> + </spark> + <ok to="End"/> + <error to="Kill"/> + </action> + + <end name="End"/> +</workflow-app> \ No newline at end of file diff --git a/dhp-workflows/dhp-rdfconverter/src/main/resources/log4j.properties b/dhp-workflows/dhp-rdfconverter/src/main/resources/log4j.properties new file mode 100644 index 000000000..63cba917e --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/main/resources/log4j.properties @@ -0,0 +1,9 @@ +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=INFO, A1 + +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. 
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java new file mode 100644 index 000000000..e74c17352 --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java @@ -0,0 +1,31 @@ + +package eu.dnetlib.dhp.rdfconverter.bioschema; + +import java.io.InputStream; +import java.util.ArrayList; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.rdfconverter.utils.RDFConverter; + +public class ConverterTest { + + static Logger logger = LoggerFactory.getLogger(ConverterTest.class); + + @Test +// @Disabled + public void nqToDataciteTest() throws Exception { + InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq"); + String nq = IOUtils.toString(is); + logger.info("NQ: " + nq); + RDFConverter converter = new RDFConverter(); + ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq); + if (results != null && !results.isEmpty()) { + logger.info("JSON DATACITE: " + results.get(0)); + } + } +} diff --git a/dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq b/dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq new file mode 100644 index 000000000..f26a4b1d9 --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq @@ -0,0 +1,52 @@ +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> <http://purl.org/pav/retrievedFrom> <https://disprot.org/DP01454> . 
+<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> <http://purl.org/pav/retrievedOn> "2021-11-25T12:23:57"^^<http://www.w3.org/2001/XMLSchema#dateTime> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> <http://purl.org/pav/createdWith> <https://github.com/HW-SWeL/BMUSE/releases/tag/0.5.2> . +<https://disprot.org/DP01454> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/Protein> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454> <http://purl.org/dc/terms/conformsTo> <https://bioschemas.org/profiles/Protein/0.11-RELEASE> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454> <https://schema.org/hasBioPolymerSequence> "MSTLFPSLFPRVTETLWFNLDRPCVEETELQQQEQQHQAWLQSIAEKDNNLVPIGKPASEHYDDEEEEDDEDDEDSEEDSEDDEDMQDMDEMNDYNESPDDGEVNEVDMEGNEQDQDQWMI" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454> <https://schema.org/hasSequenceAnnotation> <https://disprot.org/DP01454#disorder-content> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454> <https://schema.org/hasSequenceAnnotation> <https://disprot.org/DP01454r001> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454> <https://schema.org/identifier> "https://identifiers.org/disprot:DP01454" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454> <https://schema.org/includedInDataset> "https://disprot.org/#2021-08" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454> <https://schema.org/name> "Anaphase-promoting complex subunit 15" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454> <https://schema.org/sameAs> <http://purl.uniprot.org/uniprot/P60006> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . 
+<https://disprot.org/DP01454> <https://schema.org/taxonomicRange> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/200140588> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/profiles/Protein/0.11-RELEASE> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/CreativeWork> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454#disorder-content> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/SequenceAnnotation> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454#disorder-content> <https://schema.org/additionalProperty> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/159543474> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454#disorder-content> <https://schema.org/sequenceLocation> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/77094838> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/159543474> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/PropertyValue> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/159543474> <https://schema.org/name> "Protein disorder content" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/159543474> <https://schema.org/propertyID> <https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00499> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . 
+<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/159543474> <https://schema.org/value> "5.371900826446281E-1" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/77094838> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/SequenceRange> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/77094838> <https://schema.org/rangeEnd> "121" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/77094838> <https://schema.org/rangeStart> "1" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454r001> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/SequenceAnnotation> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454r001> <https://schema.org/additionalProperty> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/1595402293> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454r001> <https://schema.org/sequenceLocation> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/1951282934> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454r001> <https://schema.org/subjectOf> <https://identifiers.org/pubmed:26083744> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/1595402293> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/PropertyValue> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . 
+<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/1595402293> <https://schema.org/name> "Term" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/1595402293> <https://schema.org/value> <https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00076> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00076> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/DefinedTerm> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00076> <https://schema.org/inDefinedTermSet> <https://disprot.org/assets/data/IDPO_v0.2.owl> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00076> <https://schema.org/name> "disorder" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/assets/data/IDPO_v0.2.owl#IDPO:00076> <https://schema.org/termCode> "IDPO:00076" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/assets/data/IDPO_v0.2.owl> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/DefinedTermSet> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/assets/data/IDPO_v0.2.owl> <https://schema.org/name> "IDP ontology" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/1951282934> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/SequenceRange> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/1951282934> <https://schema.org/rangeEnd> "121" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . 
+<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/1951282934> <https://schema.org/rangeStart> "57" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://identifiers.org/pubmed:26083744> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/ScholarlyArticle> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/200140588> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/DefinedTerm> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/200140588> <https://schema.org/inDefinedTermSet> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/827138196> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/200140588> <https://schema.org/sameAs> <http://purl.uniprot.org/taxonomy/9606> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/200140588> <https://schema.org/sameAs> <https://identifiers.org/taxonomy:9606> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/200140588> <https://schema.org/sameAs> <http://purl.obolibrary.org/obo/NCBITaxon_9606> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/200140588> <https://schema.org/termCode> "9606" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . 
+<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/200140588> <https://schema.org/url> <http://purl.bioontology.org/ontology/NCBITAXON/9606> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/827138196> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/DefinedTermSet> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/827138196> <https://schema.org/name> "NCBI taxon" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0/disprot.org/DP01454/827138196> <https://schema.org/url> <https://bioportal.bioontology.org/ontologies/NCBITAXON> <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . +<https://disprot.org/DP01454> <http://purl.org/dc/terms/title> "DisProt" <https://bioschemas.org/crawl/v1/disprot/DP01454/20211125/0> . \ No newline at end of file diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 53d029467..db9608753 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -38,6 +38,8 @@ <module>dhp-usage-raw-data-update</module> <module>dhp-broker-events</module> <module>dhp-doiboost</module> + <module>dhp-bmuse</module> + <module>dhp-rdfconverter</module> </modules> <pluginRepositories> diff --git a/pom.xml b/pom.xml index 9e9bbaa16..e0ce76b03 100644 --- a/pom.xml +++ b/pom.xml @@ -105,6 +105,18 @@ <enabled>false</enabled> </snapshots> </repository> + <repository> + <id>dnet-deps</id> + <name>D-Net Dependencies</name> + <url>https://maven.d4science.org/nexus/content/repositories/dnet-deps/</url> + <releases> + <enabled>true</enabled> + </releases> + <snapshots> + <enabled>false</enabled> + </snapshots> + <layout>default</layout> + </repository> </repositories> <dependencies>