diff --git a/dhp-workflows/dhp-bmuse/pom.xml b/dhp-workflows/dhp-bmuse/pom.xml
new file mode 100644
index 000000000..b34a0934b
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/pom.xml
@@ -0,0 +1,42 @@
+
+
+ 4.0.0
+
+ eu.dnetlib.dhp
+ dhp-workflows
+ 1.2.4-SNAPSHOT
+
+ dhp-bmuse
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ hwu.elixir
+ bmuse-core
+ 0.5.3
+
+
+ com.google.guava
+ guava
+ 22.0
+
+
+ com.squareup.okhttp3
+ okhttp
+ 3.11.0
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java
new file mode 100644
index 000000000..28ad1ee0a
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/bioschema/ScrapingJob.java
@@ -0,0 +1,112 @@
+
+package eu.dnetlib.dhp.bmuse.bioschema;
+
+import java.nio.charset.Charset;
+import java.text.SimpleDateFormat;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Stream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.util.LongAccumulator;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.bmuse.utils.BMUSEScraper;
+import eu.dnetlib.dhp.bmuse.utils.UrlParser;
+import eu.dnetlib.dhp.utils.DHPUtils;
+
+/**
+ * Entry point that scrapes Bioschemas markup from every page listed in a sitemap
+ * and writes the result into a single Hadoop SequenceFile (key = page URL,
+ * value = scraped nquads), block-compressed with gzip.
+ *
+ * NOTE(review): this hunk appears whitespace/markup-mangled by extraction;
+ * generic type parameters (e.g. Stream&lt;Element&gt;) may have been stripped —
+ * confirm against the repository before relying on exact types.
+ */
+public class ScrapingJob {
+
+ static Logger logger = LoggerFactory.getLogger(ScrapingJob.class);
+ // Used only for human-readable timestamps in the start/end log lines.
+ private static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
+
+ public static void main(String[] args) throws Exception {
+
+ // Argument definitions live in the bundled generate_dataset.json resource.
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils
+ .toString(
+ ScrapingJob.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json")));
+ parser.parseArgument(args);
+
+ final String nameNode = parser.get("nameNode");
+ final String workingPath = parser.get("workingPath");
+ final String rdfOutput = parser.get("rdfOutput");
+ final String sitemapUrl = parser.get("sitemapUrl");
+ final String sitemapURLKey = parser.get("sitemapURLKey");
+ final String dynamic = parser.get("dynamic");
+ final String maxScrapedPages = parser.get("maxScrapedPages");
+ // "dynamic" is optional and defaults to true (selenium-based scraping);
+ // false selects static JSOUP scraping. See BMUSEScraper.scrapeUrl.
+ Boolean dynamicValue = true;
+ if (Objects.nonNull(dynamic)) {
+ dynamicValue = Boolean.parseBoolean(dynamic);
+ }
+ final boolean scrapingType = dynamicValue.booleanValue();
+
+ // Counters are updated inside the forEach lambda below; errors are only
+ // logged/counted — a page failure never aborts the whole run.
+ AtomicLong scraped = new AtomicLong(0l);
+ AtomicLong errors = new AtomicLong(0l);
+
+ logger
+ .info(
+ "*************************** STARTING SCRAPE: "
+ + formatter.format(new Date(System.currentTimeMillis())));
+ logger.info("Default charset: " + Charset.defaultCharset());
+
+ BMUSEScraper scraper = new BMUSEScraper();
+ String url = sitemapUrl.toLowerCase();
+ Elements urls = UrlParser.getSitemapList(url, sitemapURLKey);
+ long total = urls.size();
+
+ // Output path is nameNode + workingPath + rdfOutput, concatenated verbatim
+ // (the workflow is expected to supply the separating slashes).
+ Path output = new Path(
+ nameNode
+ .concat(workingPath)
+ .concat(rdfOutput));
+ Configuration conf = DHPUtils.getHadoopConfiguration(nameNode);
+ try (SequenceFile.Writer writer = SequenceFile
+ .createWriter(
+ conf,
+ SequenceFile.Writer.file(output),
+ SequenceFile.Writer.keyClass(Text.class),
+ SequenceFile.Writer.valueClass(Text.class),
+ SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
+ // Optional cap on the number of pages scraped (null = no limit).
+ Stream urlStream = null;
+ if (Objects.nonNull(maxScrapedPages)) {
+ urlStream = urls.stream().limit(Long.parseLong(maxScrapedPages));
+ } else {
+ urlStream = urls.stream();
+ }
+ urlStream.forEach(u -> {
+ try {
+ final Text key = new Text(u.text());
+ final Text value = new Text(scraper.scrapeUrl(u.text(), scrapingType));
+ writer.append(key, value);
+ scraped.getAndIncrement();
+ } catch (Exception e) {
+ // Best-effort: log the failing URL and continue with the next one.
+ logger.error(u.text(), e);
+ errors.getAndIncrement();
+ }
+ });
+ }
+
+ logger
+ .info(
+ "*************************** ENDING SCRAPE: " + formatter.format(new Date(System.currentTimeMillis())));
+ logger
+ .info(
+ "Total pages to scrape: " + total + " Scraped: " + scraped.get() +
+ " Errors: " + errors.get());
+ }
+}
diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java
new file mode 100644
index 000000000..1e58503fa
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/BMUSEScraper.java
@@ -0,0 +1,85 @@
+
+package eu.dnetlib.dhp.bmuse.utils;
+
+import java.io.StringWriter;
+
+import org.apache.any23.source.DocumentSource;
+import org.apache.any23.source.StringDocumentSource;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Model;
+import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
+import org.eclipse.rdf4j.rio.RDFFormat;
+import org.eclipse.rdf4j.rio.Rio;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import hwu.elixir.scrape.exceptions.*;
+import hwu.elixir.scrape.scraper.ScraperFilteredCore;
+
+/**
+ * Thin wrapper around BMUSE's ScraperFilteredCore that scrapes one URL and
+ * returns its embedded markup serialized as NQUADS (one String).
+ *
+ * NOTE(review): several failure modes return an error *message* as the result
+ * String instead of throwing — the caller (ScrapingJob) will store such
+ * messages in the output SequenceFile as if they were scraped data. Confirm
+ * whether that is intentional.
+ */
+public class BMUSEScraper extends ScraperFilteredCore {
+
+ private static final Logger logger = LoggerFactory.getLogger(BMUSEScraper.class.getName());
+
+ /**
+ * Scrapes a single page.
+ *
+ * @param url     page to scrape (normalized via fixURL first)
+ * @param dynamic true = selenium-based extraction, false = static JSOUP
+ * @return NQUADS serialization of the page's markup, or an error message string
+ * @throws MissingMarkupException when no triples could be extracted
+ * @throws FourZeroFourException  when the page returns a 404
+ */
+ public String scrapeUrl(String url, Boolean dynamic)
+ throws MissingMarkupException, FourZeroFourException {
+ url = fixURL(url);
+
+ String html = "";
+ // The dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information
+ // (dynamic and static respectively)
+
+ if (dynamic) {
+ html = wrapHTMLExtraction(url);
+ } else {
+ html = wrapHTMLExtractionStatic(url);
+ }
+
+ // NOTE(review): "empty html" sentinel string is returned as data, not an error.
+ if (html == null || html.contentEquals(""))
+ return new String("empty html");
+ if (logger.isTraceEnabled()) {
+ logger.trace("Read following html ==============================================================");
+ logger.trace(html);
+ }
+
+ try {
+ // injectId stamps the source URL into the page's JSON-LD so provenance survives.
+ html = injectId(html, url);
+ if (logger.isTraceEnabled()) {
+ logger
+ .trace(
+ "Same HTML after injecting ID ==============================================================");
+ logger.trace(html);
+ }
+ } catch (MissingHTMLException | JsonLDInspectionException e) {
+ logger.error(e.toString());
+ return e.getMessage();
+ }
+
+ DocumentSource source = new StringDocumentSource(html, url);
+ IRI sourceIRI = SimpleValueFactory.getInstance().createIRI(source.getDocumentIRI());
+
+ String n3 = getTriplesInNTriples(source);
+ if (n3 == null)
+ throw new MissingMarkupException(url);
+
+ Model updatedModel = null;
+ try {
+ // 0l = no extra context offset; parses the NTriples into an RDF4J model.
+ updatedModel = processTriples(n3, sourceIRI, 0l);
+ } catch (NTriplesParsingException e1) {
+ logger
+ .error(
+ "Failed to process triples into model; the NTriples generated from the URL (" + url
+ + ") could not be parsed into a model.");
+ return e1.getMessage();
+ }
+ if (updatedModel == null)
+ return new String("rdf model null");
+
+ // NOTE(review): despite the writer's name, the output format is NQUADS,
+ // not JSON-LD (RDFFormat.NQUADS below).
+ try (StringWriter jsonLDWriter = new StringWriter()) {
+ Rio.write(updatedModel, jsonLDWriter, RDFFormat.NQUADS);
+ return jsonLDWriter.toString();
+ } catch (Exception e) {
+ logger.error("Problem writing jsonld for " + url, e);
+ return e.getMessage();
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java
new file mode 100644
index 000000000..c1c626b69
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/src/main/java/eu/dnetlib/dhp/bmuse/utils/UrlParser.java
@@ -0,0 +1,65 @@
+
+package eu.dnetlib.dhp.bmuse.utils;
+
+import java.io.IOException;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import hwu.elixir.utils.Helpers;
+
+/**
+ * Downloads a sitemap (optionally gzip-compressed) and extracts the elements
+ * matching a caller-supplied selector key (typically "loc").
+ */
+public class UrlParser {
+
+ private static final Logger logger = LoggerFactory.getLogger(UrlParser.class.getName());
+
+ /**
+ * Fetches the sitemap at {@code url} and returns all elements selected by
+ * {@code sitemapURLKey}.
+ *
+ * NOTE(review): a fetch failure is only logged — the method then selects
+ * against the empty placeholder Document created below and silently returns
+ * an empty Elements. Also, the {@code sitemaps} selection in the
+ * sitemapindex branch is computed but never used; sitemapindex files are
+ * not actually supported (only a warning is emitted).
+ *
+ * @param url           sitemap location; ".gz" suffix triggers decompression
+ * @param sitemapURLKey CSS/selector key for the URL-bearing tag
+ * @return selected elements (empty when the fetch or parse failed)
+ */
+ public static Elements getSitemapList(String url, String sitemapURLKey) throws IOException {
+
+ // Placeholder document; replaced on successful fetch, used as-is on failure.
+ Document doc = new Document(url);
+ Document urlSitemapListsNested;
+ Elements elements = new Elements();
+ Elements sitemaps = new Elements();
+ boolean sitemapindex = false;
+ boolean urlset = false;
+
+ try {
+ int urlLength = url.length();
+ logger.info("parse sitemap list");
+ String sitemapExt = url.substring(urlLength - 3, urlLength);
+ if (sitemapExt.equalsIgnoreCase(".gz")) { // this checks only the extension at the ending
+ logger.info("compressed sitemap");
+ byte[] bytes = Jsoup.connect(url).ignoreContentType(true).execute().bodyAsBytes();
+ doc = Helpers.gzipFileDecompression(bytes);
+ } else {
+ // maxBodySize(0) disables Jsoup's default 2MB body cap — sitemaps can be large.
+ doc = Jsoup.connect(url).maxBodySize(0).get();
+ }
+
+ } catch (IOException e) {
+ // NOTE(review): swallowed — caller cannot distinguish "empty sitemap"
+ // from "fetch failed".
+ logger.error("Jsoup parsing exception: " + e.getMessage());
+ }
+
+ try {
+
+ elements = doc.select(sitemapURLKey);
+
+ // check the html if it is a sitemapindex or a urlset
+ sitemapindex = doc.outerHtml().contains("sitemapindex");
+ urlset = doc.outerHtml().contains("urlset");
+ } catch (NullPointerException e) {
+ logger.error(e.getMessage());
+ }
+
+ if (sitemapindex) {
+ // if sitemapindex get the loc of all the sitemaps
+ // added warning for sitemap index files
+ logger
+ .warn(
+ "please note this is a sitemapindex file which is not currently supported, please use the content (url) of the urlset instead");
+ sitemaps = doc.select(sitemapURLKey);
+ }
+
+ return elements;
+ }
+}
diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json
new file mode 100644
index 000000000..1ac0b50de
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/generate_dataset.json
@@ -0,0 +1,44 @@
+[
+ {
+ "paramName": "n",
+ "paramLongName": "nameNode",
+ "paramDescription": "the Name Node URI",
+ "paramRequired": true
+ },
+ {
+ "paramName": "w",
+ "paramLongName": "workingPath",
+ "paramDescription": "the working path",
+ "paramRequired": true
+ },
+ {
+ "paramName": "r",
+ "paramLongName": "rdfOutput",
+ "paramDescription": "the rdf output file name (written under the working path)",
+ "paramRequired": true
+ },
+ {
+ "paramName": "u",
+ "paramLongName": "sitemapUrl",
+ "paramDescription": "the sitemap url",
+ "paramRequired": true
+ },
+ {
+ "paramName": "k",
+ "paramLongName": "sitemapURLKey",
+ "paramDescription": "the sitemap file contains a list of xml entries, each one has a tag identified with sitemapURLKey with the url as value",
+ "paramRequired": true
+ },
+ {
+ "paramName": "d",
+ "paramLongName": "dynamic",
+ "paramDescription": "the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)",
+ "paramRequired": false
+ },
+ {
+ "paramName": "m",
+ "paramLongName": "maxScrapedPages",
+ "paramDescription": "max number of pages that will be scraped, default: no limit",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml
new file mode 100644
index 000000000..7b13aab55
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/config-default.xml
@@ -0,0 +1,68 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ jobTracker
+ yarn
+
+
+ nameNode
+ hdfs://hadoop-rm1.garr-pa1.d4science.org:8020
+
+
+ hive_metastore_uris
+ thrift://hadoop-edge3.garr-pa1.d4science.org:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://hadoop-rm2.garr-pa1.d4science.org:19888
+
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ spark2ExtraListeners
+ "com.cloudera.spark.lineage.NavigatorAppListener"
+
+
+ spark2SqlQueryExecutionListeners
+ "com.cloudera.spark.lineage.NavigatorQueryListener"
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml
new file mode 100644
index 000000000..705396653
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/src/main/resources/eu/dnetlib/dhp/bmuse/bioschema/oozie_app/workflow.xml
@@ -0,0 +1,98 @@
+
+
+
+ workingPath
+ /data/bioschema/disprot/
+ the working path
+
+
+ rdfOutput
+ nquads.seq
+ rdf output of scraping step
+
+
+ sitemapUrl
+ https://disprot.org/sitemap2.xml.gz
+
+
+ sitemapURLKey
+ loc
+
+
+ dynamic
+ true
+ the dynamic boolean determines if the scraper should start using selenium or JSOUP to scrape the information (dynamic and static respectively)
+
+
+ maxScrapedPages
+ 100
+ max number of pages that will be scraped, default: no limit
+
+
+ oozie.launcher.mapreduce.map.java.opts
+ -Xmx4g
+
+
+ spark2RdfConversionMaxExecutors
+ 50
+
+
+ sparkDriverMemory
+ 7G
+ memory for driver process
+
+
+ sparkExecutorMemory
+ 2G
+ memory for individual executor
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+ spark 2.* extra listeners classname
+
+
+ spark2YarnHistoryServerAddress
+ spark 2.* yarn history server address
+
+
+ spark2EventLogDir
+ spark 2.* event log dir location
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+ ${nameNode}
+ eu.dnetlib.dhp.bmuse.bioschema.ScrapingJob
+ --nameNode${nameNode}
+ --workingPath${workingPath}
+ --rdfOutput${rdfOutput}
+ --sitemapUrl${sitemapUrl}
+ --sitemapURLKey${sitemapURLKey}
+ --dynamic${dynamic}
+ --maxScrapedPages${maxScrapedPages}
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties b/dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties
new file mode 100644
index 000000000..26f94f2df
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/src/main/resources/localconfig.properties
@@ -0,0 +1,4 @@
+maxLimitScrape=200000
+schemaContext=https\://schema.org/docs/jsonldcontext.jsonld
+dynamic=true
+chromiumDriverLocation=/bin/chromedriver
\ No newline at end of file
diff --git a/dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties b/dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties
new file mode 100644
index 000000000..63cba917e
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/src/main/resources/log4j.properties
@@ -0,0 +1,9 @@
+# Set root logger level to DEBUG and its only appender to A1.
+log4j.rootLogger=INFO, A1
+
+# A1 is set to be a ConsoleAppender.
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
diff --git a/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java b/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java
new file mode 100644
index 000000000..2b87d069a
--- /dev/null
+++ b/dhp-workflows/dhp-bmuse/src/test/java/eu/dnetlib/dhp/bmuse/bioschema/SitemapTest.java
@@ -0,0 +1,24 @@
+
+package eu.dnetlib.dhp.bmuse.bioschema;
+
+import org.jsoup.select.Elements;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.bmuse.utils.UrlParser;
+
+/**
+ * Manual smoke test for UrlParser: fetches a real gzip-compressed sitemap and
+ * logs every URL it contains. Disabled by default because it requires network
+ * access to disprot.org.
+ */
+public class SitemapTest {
+
+ static Logger logger = LoggerFactory.getLogger(SitemapTest.class);
+
+ @Test
+ @Disabled
+ void sitemapGzTest() throws Exception {
+ // "loc" is the sitemap tag that carries each page URL.
+ Elements urls = UrlParser.getSitemapList("https://disprot.org/sitemap2.xml.gz", "loc");
+ urls.forEach(url -> {
+ logger.info(url.text());
+ });
+ }
+}
diff --git a/dhp-workflows/dhp-rdfconverter/pom.xml b/dhp-workflows/dhp-rdfconverter/pom.xml
new file mode 100644
index 000000000..77d48a1e9
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/pom.xml
@@ -0,0 +1,42 @@
+
+
+ 4.0.0
+
+ eu.dnetlib.dhp
+ dhp-workflows
+ 1.2.4-SNAPSHOT
+
+ dhp-rdfconverter
+
+
+
+ org.apache.spark
+ spark-core_2.11
+
+
+ org.apache.spark
+ spark-sql_2.11
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ ${project.version}
+
+
+ org.apache.any23
+ apache-any23-core
+ 2.3
+
+
+ org.eclipse.rdf4j
+ rdf4j-rio-rdfxml
+ 2.5.4
+
+
+ org.eclipse.rdf4j
+ rdf4j-model
+ 2.5.4
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/SparkRdfToDatacite.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/SparkRdfToDatacite.java
new file mode 100644
index 000000000..057492dab
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/SparkRdfToDatacite.java
@@ -0,0 +1,71 @@
+
+package eu.dnetlib.dhp.rdfconverter.bioschema;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.rdfconverter.utils.RDFConverter;
+
+/**
+ * Spark job that reads the SequenceFile produced by the bmuse scraping step
+ * (url -> nquads) and converts each record into one or more Datacite JSON
+ * documents, saved as gzip-compressed text files.
+ *
+ * NOTE(review): generic type parameters (JavaPairRDD&lt;Text, Text&gt;,
+ * ArrayList&lt;String&gt;, ...) appear stripped from this hunk by extraction;
+ * confirm against the repository.
+ */
+public class SparkRdfToDatacite {
+
+ static Logger logger = LoggerFactory.getLogger(SparkRdfToDatacite.class);
+
+ public static void main(String[] args) throws Exception {
+
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils
+ .toString(
+ SparkRdfToDatacite.class
+ .getResourceAsStream(
+ "/eu/dnetlib/dhp/rdfconverter/bioschema/generate_dataset.json")));
+ parser.parseArgument(args);
+ // Tests pass a managed session; production (oozie) lets the job create its own.
+ Boolean isSparkSessionManaged = Optional
+ .ofNullable(parser.get("isSparkSessionManaged"))
+ .map(Boolean::valueOf)
+ .orElse(Boolean.TRUE);
+ final String workingPath = parser.get("workingPath");
+ final String rdfNquadsRecords = parser.get("rdfInput");
+ final String output = parser.get("output");
+
+ SparkConf conf = new SparkConf();
+ runWithSparkSession(
+ conf,
+ isSparkSessionManaged,
+ spark -> {
+ JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+ String rdfNquadsRecordsPath = workingPath.concat(rdfNquadsRecords);
+ JavaPairRDD rdfNquadsRecordsRDD = sc
+ .sequenceFile(rdfNquadsRecordsPath, Text.class, Text.class);
+ logger.info("Rdf nquads records retrieved: {}", rdfNquadsRecordsRDD.count());
+
+ JavaRDD proteins = rdfNquadsRecordsRDD.flatMap(nquads -> {
+ RDFConverter converter = new RDFConverter();
+ ArrayList jsonlds = null;
+ try {
+ jsonlds = converter.nQuadsFile2DataciteJson(nquads._2().toString());
+ } catch (Exception e) {
+ // On conversion failure emit a single empty string; it is removed
+ // by the isEmpty filter below, so the record is effectively dropped.
+ logger.error(nquads._1().toString(), e);
+ return Arrays.asList(new String()).iterator();
+ }
+ return jsonlds.iterator();
+ }).filter(Objects::nonNull).filter(jsonld -> !jsonld.isEmpty()).map(jsonld -> new Text(jsonld));
+ // NOTE(review): count() + saveAsTextFile() evaluates the pipeline twice
+ // (no cache/persist) — every record is converted two times.
+ logger.info("json datacite generated: {}", proteins.count());
+ proteins.saveAsTextFile(workingPath.concat(output), GzipCodec.class);
+ });
+ }
+}
diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java
new file mode 100644
index 000000000..ffac36459
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java
@@ -0,0 +1,384 @@
+
+package eu.dnetlib.dhp.rdfconverter.bioschema.model;
+
+import java.util.List;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * Jackson binding for the compacted JSON-LD produced from scraped Bioschemas
+ * Protein markup (see RDFConverter.nQuadsFile2DataciteJson). Property names
+ * mirror schema.org / JSON-LD keywords ("@id", "@graph", ...).
+ *
+ * NOTE(review): generic type parameters (e.g. List&lt;Entry&gt; for entryList,
+ * List&lt;String&gt; for type/sameAs) appear stripped from this hunk by the
+ * diff/markup extraction — the raw List fields below are presumably typed in
+ * the real source; confirm against the repository.
+ */
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class BioSchemaProtein {
+ @JsonProperty("@id")
+ private String id;
+ // JSON-LD named graph: one Entry per protein description found on the page.
+ @JsonProperty("@graph")
+ private List entryList;
+ // Scrape timestamp injected by BMUSE (pav:retrievedOn).
+ @JsonProperty("http://purl.org/pav/retrievedOn")
+ private DateTimeType retrievedOn;
+ @JsonProperty("citation")
+ private Citation citation;
+
+ /** One protein record inside the @graph array. */
+ public static class Entry {
+ @JsonProperty("@id")
+ private String id;
+ @JsonProperty("@type")
+ private List type;
+ @JsonProperty("https://schema.org/identifier")
+ private String identifier;
+ @JsonProperty("https://schema.org/name")
+ private String name;
+ @JsonProperty("associatedDisease")
+ private List associatedDisease;
+ @JsonProperty("description")
+ private String description;
+ @JsonProperty("isEncodedByBioChemEntity")
+ private String isEncodedByBioChemEntity;
+ @JsonProperty("url")
+ private String url;
+ @JsonProperty("alternateName")
+ private String alternateName;
+ @JsonProperty("bioChemInteraction")
+ private List bioChemInteraction;
+ @JsonProperty("bioChemSimilarity")
+ private List bioChemSimilarity;
+ @JsonProperty("hasMolecularFunction")
+ private String hasMolecularFunction;
+ @JsonProperty("image")
+ private String image;
+ @JsonProperty("isInvolvedInBiologicalProcess")
+ private String isInvolvedInBiologicalProcess;
+ @JsonProperty("isPartOfBioChemEntity")
+ private IsPartOfBioChemEntity isPartOfBioChemEntity;
+ @JsonProperty("mainEntityOfPage")
+ private Link mainEntityOfPage;
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public List getType() {
+ return type;
+ }
+
+ public void setType(List type) {
+ this.type = type;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ // Declared mid-class in the original; kept in place to stay byte-identical.
+ @JsonProperty("https://schema.org/sameAs")
+ private List sameAs;
+
+ public List getSameAs() {
+ return sameAs;
+ }
+
+ public void setSameAs(List sameAs) {
+ this.sameAs = sameAs;
+ }
+
+ public String getIdentifier() {
+ return identifier;
+ }
+
+ public void setIdentifier(String identifier) {
+ this.identifier = identifier;
+ }
+
+ public String getDescription() {
+ return description;
+ }
+
+ public void setDescription(String description) {
+ this.description = description;
+ }
+
+ public String getIsEncodedByBioChemEntity() {
+ return isEncodedByBioChemEntity;
+ }
+
+ public void setIsEncodedByBioChemEntity(String isEncodedByBioChemEntity) {
+ this.isEncodedByBioChemEntity = isEncodedByBioChemEntity;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public void setUrl(String url) {
+ this.url = url;
+ }
+
+ public String getAlternateName() {
+ return alternateName;
+ }
+
+ public void setAlternateName(String alternateName) {
+ this.alternateName = alternateName;
+ }
+
+ public List getBioChemInteraction() {
+ return bioChemInteraction;
+ }
+
+ public void setBioChemInteraction(List bioChemInteraction) {
+ this.bioChemInteraction = bioChemInteraction;
+ }
+
+ public List getBioChemSimilarity() {
+ return bioChemSimilarity;
+ }
+
+ public void setBioChemSimilarity(List bioChemSimilarity) {
+ this.bioChemSimilarity = bioChemSimilarity;
+ }
+
+ public String getHasMolecularFunction() {
+ return hasMolecularFunction;
+ }
+
+ public void setHasMolecularFunction(String hasMolecularFunction) {
+ this.hasMolecularFunction = hasMolecularFunction;
+ }
+
+ public String getImage() {
+ return image;
+ }
+
+ public void setImage(String image) {
+ this.image = image;
+ }
+
+ public String getIsInvolvedInBiologicalProcess() {
+ return isInvolvedInBiologicalProcess;
+ }
+
+ public void setIsInvolvedInBiologicalProcess(String isInvolvedInBiologicalProcess) {
+ this.isInvolvedInBiologicalProcess = isInvolvedInBiologicalProcess;
+ }
+
+ public List getAssociatedDisease() {
+ return associatedDisease;
+ }
+
+ public void setAssociatedDisease(List associatedDisease) {
+ this.associatedDisease = associatedDisease;
+ }
+
+ public IsPartOfBioChemEntity getIsPartOfBioChemEntity() {
+ return isPartOfBioChemEntity;
+ }
+
+ public void setIsPartOfBioChemEntity(IsPartOfBioChemEntity isPartOfBioChemEntity) {
+ this.isPartOfBioChemEntity = isPartOfBioChemEntity;
+ }
+
+ public Link getMainEntityOfPage() {
+ return mainEntityOfPage;
+ }
+
+ public void setMainEntityOfPage(Link mainEntityOfPage) {
+ this.mainEntityOfPage = mainEntityOfPage;
+ }
+
+ }
+
+ public static class IsPartOfBioChemEntity {
+ @JsonProperty("@type")
+ private String type;
+ @JsonProperty("url")
+ private String url;
+
+ public String getType() {
+ return type;
+ }
+
+ public void setType(String type) {
+ this.type = type;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public void setUrl(String url) {
+ this.url = url;
+ }
+ }
+
+ public static class AssociatedDisease {
+ @JsonProperty("@type")
+ private String type;
+ @JsonProperty("name")
+ private String name;
+ @JsonProperty("code")
+ private DeseaseCode code;
+ @JsonProperty("id")
+ private String id;
+
+ public String getType() {
+ return type;
+ }
+
+ public void setType(String type) {
+ this.type = type;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ public DeseaseCode getCode() {
+ return code;
+ }
+
+ public void setCode(DeseaseCode code) {
+ this.code = code;
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+ }
+
+ // NOTE(review): "Desease" is a misspelling of "Disease"; kept as-is because
+ // renaming the class would be an interface change for any existing caller.
+ public static class DeseaseCode {
+ @JsonProperty("@type")
+ private String type;
+ @JsonProperty("codeValue")
+ private String codeValue;
+ @JsonProperty("codingSystem")
+ private String codingSystem;
+
+ public String getType() {
+ return type;
+ }
+
+ public void setType(String type) {
+ this.type = type;
+ }
+
+ public String getCodeValue() {
+ return codeValue;
+ }
+
+ public void setCodeValue(String codeValue) {
+ this.codeValue = codeValue;
+ }
+
+ public String getCodingSystem() {
+ return codingSystem;
+ }
+
+ public void setCodingSystem(String codingSystem) {
+ this.codingSystem = codingSystem;
+ }
+ }
+
+ /** A JSON-LD node reference: an object carrying only an "@id". */
+ public static class Link {
+ @JsonProperty("@id")
+ private String id;
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+ }
+
+ /** A typed JSON-LD literal: {"@type": ..., "@value": ...}. */
+ public static class DateTimeType {
+ @JsonProperty("@type")
+ private String type;
+ @JsonProperty("@value")
+ private String value;
+
+ public String getType() {
+ return type;
+ }
+
+ public void setType(String type) {
+ this.type = type;
+ }
+
+ public String getValue() {
+ return value;
+ }
+
+ public void setValue(String value) {
+ this.value = value;
+ }
+ }
+
+ public static class Citation {
+ @JsonProperty("@type")
+ private String type;
+ @JsonProperty("@id")
+ private String id;
+
+ public String getType() {
+ return type;
+ }
+
+ public void setType(String type) {
+ this.type = type;
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public List getEntryList() {
+ return entryList;
+ }
+
+ public void setEntryList(List entryList) {
+ this.entryList = entryList;
+ }
+
+ public DateTimeType getRetrievedOn() {
+ return retrievedOn;
+ }
+
+ public void setRetrievedOn(DateTimeType retrievedOn) {
+ this.retrievedOn = retrievedOn;
+ }
+
+ public Citation getCitation() {
+ return citation;
+ }
+
+ public void setCitation(Citation citation) {
+ this.citation = citation;
+ }
+}
diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java
new file mode 100644
index 000000000..6fe0963e2
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java
@@ -0,0 +1,291 @@
+
+package eu.dnetlib.dhp.rdfconverter.bioschema.model;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+
+/**
+ * Output model: a Datacite metadata record built from a BioSchemaProtein.
+ * Serialized with NON_NULL inclusion so absent fields are omitted from the
+ * emitted JSON.
+ *
+ * NOTE(review): generic type parameters (e.g. List&lt;Identifier&gt;,
+ * ArrayList&lt;&gt;) appear stripped from this hunk by the diff/markup
+ * extraction — the raw List fields below are presumably typed in the real
+ * source; confirm against the repository.
+ */
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class DataciteProtein {
+ private String id;
+ private String doi;
+ private Types types;
+ List creators = new ArrayList();
+ private String publisher;
+ private String publicationYear;
+ // Fixed Datacite kernel version for every emitted record.
+ private static final String schemaVersion = "http://datacite.org/schema/kernel-4";
+ List identifiers = new ArrayList();
+ List relatedIdentifiers = new ArrayList();
+ List alternateIdentifiers = new ArrayList();
+ List descriptions = new ArrayList();
+ List titles = new ArrayList();
+ private List dates = new ArrayList();
+
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ public static class Types {
+ private String resourceType;
+ private String resourceTypeGeneral;
+
+ public String getResourceType() {
+ return resourceType;
+ }
+
+ public void setResourceType(String resourceType) {
+ this.resourceType = resourceType;
+ }
+
+ public String getResourceTypeGeneral() {
+ return resourceTypeGeneral;
+ }
+
+ public void setResourceTypeGeneral(String resourceTypeGeneral) {
+ this.resourceTypeGeneral = resourceTypeGeneral;
+ }
+ }
+
+ // NOTE(review): intentionally empty placeholder? No fields or accessors here.
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ public static class Creators {
+ }
+
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ public static class Identifier {
+ private String identifier;
+ private String identifierType;
+
+ public String getIdentifier() {
+ return identifier;
+ }
+
+ public void setIdentifier(String identifier) {
+ this.identifier = identifier;
+ }
+
+ public String getIdentifierType() {
+ return identifierType;
+ }
+
+ public void setIdentifierType(String identifierType) {
+ this.identifierType = identifierType;
+ }
+ }
+
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ public static class RelatedIdentifier {
+ private String relationType;
+ private String relatedIdentifier;
+ private String relatedIdentifierType;
+
+ public String getRelationType() {
+ return relationType;
+ }
+
+ public void setRelationType(String relationType) {
+ this.relationType = relationType;
+ }
+
+ public String getRelatedIdentifier() {
+ return relatedIdentifier;
+ }
+
+ public void setRelatedIdentifier(String relatedIdentifier) {
+ this.relatedIdentifier = relatedIdentifier;
+ }
+
+ public String getRelatedIdentifierType() {
+ return relatedIdentifierType;
+ }
+
+ public void setRelatedIdentifierType(String relatedIdentifierType) {
+ this.relatedIdentifierType = relatedIdentifierType;
+ }
+ }
+
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ public static class AlternateIdentifier {
+ private String alternateIdentifier;
+ private String alternateIdentifierType;
+
+ public String getAlternateIdentifier() {
+ return alternateIdentifier;
+ }
+
+ public void setAlternateIdentifier(String alternateIdentifier) {
+ this.alternateIdentifier = alternateIdentifier;
+ }
+
+ public String getAlternateIdentifierType() {
+ return alternateIdentifierType;
+ }
+
+ public void setAlternateIdentifierType(String alternateIdentifierType) {
+ this.alternateIdentifierType = alternateIdentifierType;
+ }
+ }
+
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ public static class Description {
+ private String description;
+ private String descriptionType;
+
+ public String getDescription() {
+ return description;
+ }
+
+ public void setDescription(String description) {
+ this.description = description;
+ }
+
+ public String getDescriptionType() {
+ return descriptionType;
+ }
+
+ public void setDescriptionType(String descriptionType) {
+ this.descriptionType = descriptionType;
+ }
+ }
+
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ public static class Title {
+ private String title;
+ private String titleType;
+
+ public String getTitle() {
+ return title;
+ }
+
+ public void setTitle(String title) {
+ this.title = title;
+ }
+
+ public String getTitleType() {
+ return titleType;
+ }
+
+ public void setTitleType(String titleType) {
+ this.titleType = titleType;
+ }
+ }
+
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ public static class DataciteDate {
+ private String date;
+ private String dateType;
+
+ public String getDate() {
+ return date;
+ }
+
+ public void setDate(String date) {
+ this.date = date;
+ }
+
+ public String getDateType() {
+ return dateType;
+ }
+
+ public void setDateType(String dateType) {
+ this.dateType = dateType;
+ }
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public String getDoi() {
+ return doi;
+ }
+
+ public void setDoi(String doi) {
+ this.doi = doi;
+ }
+
+ public Types getTypes() {
+ return types;
+ }
+
+ public void setTypes(Types types) {
+ this.types = types;
+ }
+
+ public List getCreators() {
+ return creators;
+ }
+
+ public void setCreators(List creators) {
+ this.creators = creators;
+ }
+
+ public String getPublisher() {
+ return publisher;
+ }
+
+ public void setPublisher(String publisher) {
+ this.publisher = publisher;
+ }
+
+ public String getPublicationYear() {
+ return publicationYear;
+ }
+
+ public void setPublicationYear(String publicationYear) {
+ this.publicationYear = publicationYear;
+ }
+
+ public static String getSchemaVersion() {
+ return schemaVersion;
+ }
+
+ public List getRelatedIdentifiers() {
+ return relatedIdentifiers;
+ }
+
+ public void setRelatedIdentifiers(List relatedIdentifiers) {
+ this.relatedIdentifiers = relatedIdentifiers;
+ }
+
+ public List getAlternateIdentifiers() {
+ return alternateIdentifiers;
+ }
+
+ public void setAlternateIdentifiers(List alternateIdentifiers) {
+ this.alternateIdentifiers = alternateIdentifiers;
+ }
+
+ public List getDescriptions() {
+ return descriptions;
+ }
+
+ public void setDescriptions(List descriptions) {
+ this.descriptions = descriptions;
+ }
+
+ public List getTitles() {
+ return titles;
+ }
+
+ public void setTitles(List titles) {
+ this.titles = titles;
+ }
+
+ public List getIdentifiers() {
+ return identifiers;
+ }
+
+ public void setIdentifiers(List identifiers) {
+ this.identifiers = identifiers;
+ }
+
+ public List getDates() {
+ return dates;
+ }
+
+ public void setDates(List dates) {
+ this.dates = dates;
+ }
+}
diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java
new file mode 100644
index 000000000..08e4a03ad
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java
@@ -0,0 +1,197 @@
+
+package eu.dnetlib.dhp.rdfconverter.utils;
+
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.util.*;
+
+import org.eclipse.rdf4j.model.Model;
+import org.eclipse.rdf4j.rio.RDFFormat;
+import org.eclipse.rdf4j.rio.RDFWriter;
+import org.eclipse.rdf4j.rio.Rio;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.github.jsonldjava.core.JsonLdOptions;
+import com.github.jsonldjava.core.JsonLdProcessor;
+import com.github.jsonldjava.utils.JsonUtils;
+
+import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
+import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein;
+
+public class RDFConverter {
+
+ private static final Logger log = LoggerFactory.getLogger(RDFConverter.class);
+
+ public ArrayList nQuadsFile2DataciteJson(String nquads) throws Exception {
+ StringReader reader = new StringReader(nquads);
+ Model model = Rio.parse(reader, "", RDFFormat.NQUADS);
+ StringWriter jsonLDWriter = new StringWriter();
+ RDFWriter rdfRecordWriter = Rio.createWriter(RDFFormat.JSONLD, jsonLDWriter);
+ Rio.write(model, rdfRecordWriter);
+ String jsonLDBuffer = jsonLDWriter.toString();
+ Object jsonObject = JsonUtils.fromString(jsonLDBuffer);
+ Object compact = JsonLdProcessor.compact(jsonObject, new HashMap<>(), new JsonLdOptions());
+ String compactContent = JsonUtils.toString(compact);
+
+ ObjectMapper objectMapper = new ObjectMapper();
+ objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
+ objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
+ objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+ BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
+ log.debug("BioSchema id: " + bioSchemaProtein.getId());
+ BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
+ BioSchemaProtein.Citation citation = bioSchemaProtein.getCitation();
+
+ ArrayList results = new ArrayList();
+ bioSchemaProtein.getEntryList().stream().forEach(entry -> {
+
+ if (entry.getType() != null
+ && entry.getType().stream().filter(type -> type.equals("https://schema.org/Protein")).count() == 1) {
+
+ DataciteProtein dataciteProtein = new DataciteProtein();
+ if (citation != null) {
+ addRelatedIdentifier(dataciteProtein, citation.getId(), "CitedBy");
+ }
+
+ DataciteProtein.Types types = new DataciteProtein.Types();
+ types.setResourceType("Protein");
+ types.setResourceTypeGeneral("Dataset");
+ dataciteProtein.setTypes(types);
+
+ DataciteProtein.DataciteDate dataciteDate = new DataciteProtein.DataciteDate();
+ dataciteDate.setDate(retrievedOnType.getValue());
+ dataciteDate.setDateType("Collected");
+ dataciteProtein.getDates().add(dataciteDate);
+
+ if (entry.getName() != null) {
+ log.debug("Name: " + entry.getName());
+ DataciteProtein.Title title = new DataciteProtein.Title();
+ title.setTitle(entry.getName());
+ dataciteProtein.getTitles().add(title);
+ }
+ DataciteProtein.Identifier identifier = new DataciteProtein.Identifier();
+ log.debug("Id: " + entry.getId());
+ identifier.setIdentifier(entry.getId());
+ identifier.setIdentifierType("URL");
+ dataciteProtein.getIdentifiers().add(identifier);
+
+ if (entry.getIdentifier() != null) {
+ log.debug("Identifier: " + entry.getIdentifier());
+ addAlternateIdentifier(dataciteProtein, entry.getIdentifier());
+ }
+
+ if (entry.getDescription() != null) {
+ log.debug("description: " + entry.getDescription());
+ DataciteProtein.Description description = new DataciteProtein.Description();
+ description.setDescription(entry.getDescription());
+ dataciteProtein.getDescriptions().add(description);
+ }
+
+ if (entry.getIsEncodedByBioChemEntity() != null) {
+ log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
+ addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
+ }
+
+ if (entry.getUrl() != null) {
+ log.debug("url: " + entry.getUrl());
+ addAlternateIdentifier(dataciteProtein, entry.getUrl());
+ }
+
+ if (entry.getAlternateName() != null) {
+ log.debug("alternateName: " + entry.getAlternateName());
+ DataciteProtein.Title title = new DataciteProtein.Title();
+ title.setTitle(entry.getAlternateName());
+ title.setTitleType("AlternativeTitle");
+ dataciteProtein.getTitles().add(title);
+ }
+
+ if (entry.getBioChemInteraction() != null) {
+ entry.getBioChemInteraction().stream().filter(Objects::nonNull).forEach(bc -> {
+ log.debug("bioChemInteraction: " + bc.getId());
+ addRelatedIdentifier(dataciteProtein, bc.getId(), "");
+ });
+ }
+
+ if (entry.getBioChemSimilarity() != null) {
+ entry.getBioChemSimilarity().stream().filter(Objects::nonNull).forEach(bc -> {
+ log.debug("bioChemSimilarity: " + bc.getId());
+ addRelatedIdentifier(dataciteProtein, bc.getId(), "");
+ });
+ }
+
+ if (entry.getHasMolecularFunction() != null) {
+ log.debug("hasMolecularFunction: " + entry.getHasMolecularFunction());
+ addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), "");
+ }
+
+ if (entry.getIsInvolvedInBiologicalProcess() != null) {
+ log.debug("isInvolvedInBiologicalProcess: " + entry.getIsInvolvedInBiologicalProcess());
+ addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), "");
+ }
+
+ if (entry.getIsEncodedByBioChemEntity() != null) {
+ log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
+ addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
+ }
+
+ if (entry.getIsPartOfBioChemEntity() != null) {
+ log.debug("isPartOfBioChemEntity: " + entry.getIsPartOfBioChemEntity());
+ addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), "");
+ }
+
+ if (entry.getSameAs() != null) {
+ entry.getSameAs().stream().filter(Objects::nonNull).forEach(sameAs -> {
+ log.debug("sameAs: " + sameAs.getId());
+ addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo");
+ });
+ }
+
+ if (entry.getAssociatedDisease() != null) {
+ entry.getAssociatedDisease().stream().filter(Objects::nonNull).forEach(ad -> {
+ log.debug("associated disease: " + ad.getName());
+ addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo");
+ });
+ }
+
+ String proteinId = "";
+ try {
+ String[] identifierParts = dataciteProtein.getIdentifiers().get(0).getIdentifier().split("/");
+ proteinId = identifierParts[identifierParts.length - 1];
+ } catch (Exception e) {
+ log.error("Identifier not found", e.getMessage());
+ }
+
+ dataciteProtein.setId(proteinId);
+
+ ObjectMapper mapper = new ObjectMapper();
+ try {
+ StringWriter writer = new StringWriter();
+ mapper.writeValue(writer, dataciteProtein);
+ results.add(writer.toString());
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+ });
+ return results;
+ }
+
+ private void addRelatedIdentifier(DataciteProtein DataciteProtein, String relatedIdentifierValue,
+ String relationType) {
+ DataciteProtein.RelatedIdentifier relatedIdentifier = new DataciteProtein.RelatedIdentifier();
+ relatedIdentifier.setRelatedIdentifier(relatedIdentifierValue);
+ if (!relationType.isEmpty()) {
+ relatedIdentifier.setRelationType(relationType);
+ }
+ DataciteProtein.getRelatedIdentifiers().add(relatedIdentifier);
+ }
+
+ private void addAlternateIdentifier(DataciteProtein DataciteProtein, String alternateIdentifierValue) {
+ DataciteProtein.AlternateIdentifier alternateIdentifier = new DataciteProtein.AlternateIdentifier();
+ alternateIdentifier.setAlternateIdentifier(alternateIdentifierValue);
+ DataciteProtein.getAlternateIdentifiers().add(alternateIdentifier);
+ }
+}
diff --git a/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/generate_dataset.json b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/generate_dataset.json
new file mode 100644
index 000000000..4038c275f
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/generate_dataset.json
@@ -0,0 +1,26 @@
+[
+ {
+ "paramName": "n",
+ "paramLongName": "nameNode",
+ "paramDescription": "the Name Node URI",
+ "paramRequired": true
+ },
+ {
+ "paramName": "w",
+ "paramLongName": "workingPath",
+ "paramDescription": "the working path",
+ "paramRequired": true
+ },
+ {
+ "paramName": "i",
+ "paramLongName": "rdfInput",
+ "paramDescription": "sequence file inside working path that contains rdf records",
+ "paramRequired": true
+ },
+ {
+ "paramName": "o",
+ "paramLongName": "output",
+ "paramDescription": "relative path inside workingpath where bioschema dataset in datacite format will be stored",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/config-default.xml b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/config-default.xml
new file mode 100644
index 000000000..7b13aab55
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/config-default.xml
@@ -0,0 +1,68 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ jobTracker
+ yarn
+
+
+ nameNode
+ hdfs://hadoop-rm1.garr-pa1.d4science.org:8020
+
+
+ hive_metastore_uris
+ thrift://hadoop-edge3.garr-pa1.d4science.org:9083
+
+
+ spark2YarnHistoryServerAddress
+ http://hadoop-rm2.garr-pa1.d4science.org:19888
+
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
+ spark2EventLogDir
+ /user/spark/spark2ApplicationHistory
+
+
+ spark2ExtraListeners
+ "com.cloudera.spark.lineage.NavigatorAppListener"
+
+
+ spark2SqlQueryExecutionListeners
+ "com.cloudera.spark.lineage.NavigatorQueryListener"
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/workflow.xml b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/workflow.xml
new file mode 100644
index 000000000..0a7a2495f
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/main/resources/eu/dnetlib/dhp/rdfconverter/bioschema/oozie_app/workflow.xml
@@ -0,0 +1,94 @@
+
+
+
+ workingPath
+ /data/bioschema/disprot/
+ the working path
+
+
+ rdfInput
+ nquads.seq
+ rdf output of scraping workflow
+
+
+ output
+ json-datacite/
+
+
+ oozie.launcher.mapreduce.map.java.opts
+ -Xmx4g
+
+
+ spark2RdfConversionMaxExecutors
+ 50
+
+
+ sparkDriverMemory
+ 7G
+ memory for driver process
+
+
+ sparkExecutorMemory
+ 2G
+ memory for individual executor
+
+
+ spark2ExtraListeners
+ com.cloudera.spark.lineage.NavigatorAppListener
+ spark 2.* extra listeners classname
+
+
+ spark2YarnHistoryServerAddress
+ spark 2.* yarn history server address
+
+
+ spark2EventLogDir
+ spark 2.* event log dir location
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn-cluster
+ cluster
+ NquadsToDataciteJson
+ eu.dnetlib.dhp.rdfconverter.bioschema.SparkRdfToDatacite
+ dhp-rdfconverter-${projectVersion}.jar
+
+ --conf spark.dynamicAllocation.enabled=true
+ --conf spark.dynamicAllocation.maxExecutors=${spark2RdfConversionMaxExecutors}
+ --executor-memory=${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --nameNode${nameNode}
+ --workingPath${workingPath}
+ --rdfInput${rdfInput}
+ --output${output}
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-rdfconverter/src/main/resources/log4j.properties b/dhp-workflows/dhp-rdfconverter/src/main/resources/log4j.properties
new file mode 100644
index 000000000..63cba917e
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/main/resources/log4j.properties
@@ -0,0 +1,9 @@
+# Set root logger level to INFO and its only appender to A1.
+log4j.rootLogger=INFO, A1
+
+# A1 is set to be a ConsoleAppender.
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
diff --git a/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java
new file mode 100644
index 000000000..e74c17352
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java
@@ -0,0 +1,31 @@
+
+package eu.dnetlib.dhp.rdfconverter.bioschema;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.rdfconverter.utils.RDFConverter;
+
+public class ConverterTest {
+
+ static Logger logger = LoggerFactory.getLogger(ConverterTest.class);
+
+ @Test
+// @Disabled
+ public void nqToDataciteTest() throws Exception {
+ InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
+ String nq = IOUtils.toString(is);
+ logger.info("NQ: " + nq);
+ RDFConverter converter = new RDFConverter();
+ ArrayList results = converter.nQuadsFile2DataciteJson(nq);
+ if (results != null && !results.isEmpty()) {
+ logger.info("JSON DATACITE: " + results.get(0));
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq b/dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq
new file mode 100644
index 000000000..f26a4b1d9
--- /dev/null
+++ b/dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq
@@ -0,0 +1,52 @@
+ .
+ "2021-11-25T12:23:57"^^ .
+ .
+ .
+ .
+ "MSTLFPSLFPRVTETLWFNLDRPCVEETELQQQEQQHQAWLQSIAEKDNNLVPIGKPASEHYDDEEEEDDEDDEDSEEDSEDDEDMQDMDEMNDYNESPDDGEVNEVDMEGNEQDQDQWMI" .
+ .
+ .
+ "https://identifiers.org/disprot:DP01454" .
+ "https://disprot.org/#2021-08" .
+ "Anaphase-promoting complex subunit 15" .
+ .
+ .
+ .
+ .
+ .
+ .
+ .
+ "Protein disorder content" .
+ .
+ "5.371900826446281E-1" .
+ .
+ "121" .
+ "1" .
+ .
+ .
+ .
+ .
+ .
+ "Term" .
+ .
+ .
+ .
+ "disorder" .
+ "IDPO:00076" .
+ .
+ "IDP ontology" .
+ .
+ "121" .
+ "57" .
+ .
+ .
+ .
+ .
+ .
+ .
+ "9606" .
+ .
+ .
+ "NCBI taxon" .
+ .
+ "DisProt" .
\ No newline at end of file
diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml
index 53d029467..db9608753 100644
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@@ -38,6 +38,8 @@
dhp-usage-raw-data-update
dhp-broker-events
dhp-doiboost
+ dhp-bmuse
+ dhp-rdfconverter
diff --git a/pom.xml b/pom.xml
index 9e9bbaa16..e0ce76b03 100644
--- a/pom.xml
+++ b/pom.xml
@@ -105,6 +105,18 @@
false
+
+ dnet-deps
+ D-Net Dependencies
+ https://maven.d4science.org/nexus/content/repositories/dnet-deps/
+
+ true
+
+
+ false
+
+ default
+