collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
+ final String baseUrl = api.getBaseUrl();
+
+ final String resumptionType = api.getParams().get("resumptionType");
+ final String resumptionParam = api.getParams().get("resumptionParam");
+ final String resumptionXpath = api.getParams().get("resumptionXpath");
+ final String resultTotalXpath = api.getParams().get("resultTotalXpath");
+ final String resultFormatParam = api.getParams().get("resultFormatParam");
+ final String resultFormatValue = api.getParams().get("resultFormatValue");
+ final String resultSizeParam = api.getParams().get("resultSizeParam");
+ final String queryParams = api.getParams().get("queryParams");
+ final String entityXpath = api.getParams().get("entityXpath");
+ final String authMethod = api.getParams().get("authMethod");
+ final String authToken = api.getParams().get("authToken");
+ final String resultSizeValue = Optional
+ .ofNullable(api.getParams().get("resultSizeValue"))
+ .filter(StringUtils::isNotBlank)
+ .orElse(RESULT_SIZE_VALUE_DEFAULT);
+
+ if (StringUtils.isBlank(baseUrl)) {
+ throw new CollectorException("Param 'baseUrl' is null or empty");
+ }
+ if (StringUtils.isBlank(resumptionType)) {
+ throw new CollectorException("Param 'resumptionType' is null or empty");
+ }
+ if (StringUtils.isBlank(resumptionParam)) {
+ throw new CollectorException("Param 'resumptionParam' is null or empty");
+ }
+ if (StringUtils.isBlank(resultFormatValue)) {
+ throw new CollectorException("Param 'resultFormatValue' is null or empty");
+ }
+ if (StringUtils.isBlank(queryParams)) {
+ throw new CollectorException("Param 'queryParams' is null or empty");
+ }
+ if (StringUtils.isBlank(entityXpath)) {
+ throw new CollectorException("Param 'entityXpath' is null or empty");
+ }
+
+ final String resultOutputFormat = Optional
+ .ofNullable(api.getParams().get("resultOutputFormat"))
+ .map(String::toLowerCase)
+ .filter(StringUtils::isNotBlank)
+ .orElse(resultFormatValue.toLowerCase());
+
+ RestIterator it = new RestIterator(
+ getClientParams(),
+ baseUrl,
+ resumptionType,
+ resumptionParam,
+ resumptionXpath,
+ resultTotalXpath,
+ resultFormatParam,
+ resultFormatValue,
+ resultSizeParam,
+ resultSizeValue,
+ queryParams,
+ entityXpath,
+ authMethod,
+ authToken,
+ resultOutputFormat);
+
+ return StreamSupport
+ .stream(
+ Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
+ }
+
+ public HttpClientParams getClientParams() {
+ return clientParams;
+ }
+}
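For orientation, here is a minimal, hedged sketch of how this collector plugin might be driven from a test or a local main(). The RestCollectorPlugin package and constructor, the ApiDescriptor setters and the no-arg AggregatorReport constructor are assumptions not shown in this diff.

```java
// Illustrative sketch only; the constructors/setters marked below are assumptions.
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Stream;

import eu.dnetlib.collector.worker.model.ApiDescriptor;           // path taken from the removed worker code
import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin; // assumed package

public class RestCollectorPluginSketch {

	public static void main(String[] args) throws Exception {
		final Map<String, String> params = new HashMap<>();
		params.put("resumptionType", "count");     // scan | count | discover | pagination/page | deep-cursor
		params.put("resumptionParam", "start");
		params.put("resultFormatParam", "format");
		params.put("resultFormatValue", "json");
		params.put("resultSizeParam", "rows");
		params.put("resultSizeValue", "100");
		params.put("queryParams", "q=*");
		params.put("entityXpath", "//record");

		final ApiDescriptor api = new ApiDescriptor();     // setters below are assumed to exist
		api.setBaseUrl("https://example.org/api/search");  // placeholder endpoint
		api.setParams(params);

		final RestCollectorPlugin plugin = new RestCollectorPlugin(new HttpClientParams()); // assumed constructors
		try (Stream<String> records = plugin.collect(api, new AggregatorReport())) {
			records.limit(10).forEach(System.out::println);
		}
	}
}
```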
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
new file mode 100644
index 0000000000..764c21fc21
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
@@ -0,0 +1,411 @@
+
+package eu.dnetlib.dhp.collection.plugin.rest;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import javax.xml.xpath.*;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.http.HttpHeaders;
+import org.apache.http.entity.ContentType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+import eu.dnetlib.dhp.collection.CollectorException;
+import eu.dnetlib.dhp.collection.HttpClientParams;
+import eu.dnetlib.dhp.collection.JsonUtils;
+
+/**
+ * log.info(...) is equivalent to log.trace(...) in the application logs.
+ *
+ * Known bug: with resumptionType 'discover', collecting fails when (resultTotal % resultSizeValue) == 0 -> change the resultSizeValue.
+ *
+ * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
+ * @date 2020-04-09
+ *
+ */
+public class RestIterator implements Iterator<String> {
+
+ private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
+ public static final String UTF_8 = "UTF-8";
+
+ private final HttpClientParams clientParams;
+
+ private final String BASIC = "basic";
+
+ private final JsonUtils jsonUtils;
+
+ private final String baseUrl;
+ private final String resumptionType;
+ private final String resumptionParam;
+ private final String resultFormatValue;
+ private String queryParams;
+ private final int resultSizeValue;
+ private int resumptionInt = 0; // integer resumption token (first record to harvest)
+ private int resultTotal = -1;
+ private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
+ // or token scanned from results)
+ private InputStream resultStream;
+ private Transformer transformer;
+ private XPath xpath;
+ private String query;
+ private XPathExpression xprResultTotalPath;
+ private XPathExpression xprResumptionPath;
+ private XPathExpression xprEntity;
+ private final String queryFormat;
+ private final String querySize;
+ private final String authMethod;
+ private final String authToken;
+ private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
+ private int discoverResultSize = 0;
+ private int pagination = 1;
+ /*
+ * While resultFormatValue is added to the request parameters, this field declares that the results are actually
+ * retrieved in json. Useful when the target API expects a resultFormatValue != json but the results are returned in
+ * json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
+ */
+ private final String resultOutputFormat;
+
+ /** RestIterator class
+ * compatible with version 1.3.33
+ */
+ public RestIterator(
+ final HttpClientParams clientParams,
+ final String baseUrl,
+ final String resumptionType,
+ final String resumptionParam,
+ final String resumptionXpath,
+ final String resultTotalXpath,
+ final String resultFormatParam,
+ final String resultFormatValue,
+ final String resultSizeParam,
+ final String resultSizeValueStr,
+ final String queryParams,
+ final String entityXpath,
+ final String authMethod,
+ final String authToken,
+ final String resultOutputFormat) {
+
+ this.clientParams = clientParams;
+ this.jsonUtils = new JsonUtils();
+ this.baseUrl = baseUrl;
+ this.resumptionType = resumptionType;
+ this.resumptionParam = resumptionParam;
+ this.resultFormatValue = resultFormatValue;
+ this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
+ this.queryParams = queryParams;
+ this.authMethod = authMethod;
+ this.authToken = authToken;
+ this.resultOutputFormat = resultOutputFormat;
+
+ queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+ : "";
+ querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
+
+ try {
+ initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
+ } catch (Exception e) {
+ throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
+ }
+ initQueue();
+ }
+
+ private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
+ throws TransformerConfigurationException, XPathExpressionException {
+ transformer = TransformerFactory.newInstance().newTransformer();
+ transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+ transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
+ xpath = XPathFactory.newInstance().newXPath();
+ xprResultTotalPath = xpath.compile(resultTotalXpath);
+ xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
+ xprEntity = xpath.compile(entityXpath);
+ }
+
+ private void initQueue() {
+ query = baseUrl + "?" + queryParams + querySize + queryFormat;
+ log.info("REST calls starting with " + query);
+ }
+
+ private void disconnect() {
+ // TODO close inputstream
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see java.util.Iterator#hasNext()
+ */
+ @Override
+ public boolean hasNext() {
+ if (recordQueue.isEmpty() && query.isEmpty()) {
+ disconnect();
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see java.util.Iterator#next()
+ */
+ @Override
+ public String next() {
+ synchronized (recordQueue) {
+ while (recordQueue.isEmpty() && !query.isEmpty()) {
+ try {
+ query = downloadPage(query);
+ } catch (CollectorException e) {
+ log.debug("CollectorPlugin.next()-Exception: " + e);
+ throw new RuntimeException(e);
+ }
+ }
+ return recordQueue.poll();
+ }
+ }
+
+ /*
+ * download page and return nextQuery
+ */
+ private String downloadPage(String query) throws CollectorException {
+ String resultJson;
+ String resultXml = "";
+ String nextQuery = "";
+ String emptyXml = resultXml + "<" + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">";
+ Node resultNode = null;
+ NodeList nodeList = null;
+ String qUrlArgument = "";
+ int urlOldResumptionSize = 0;
+ InputStream theHttpInputStream;
+
+ // check if cursor=* is initial set otherwise add it to the queryParam URL
+ if (resumptionType.equalsIgnoreCase("deep-cursor")) {
+ log.debug("check resumptionType deep-cursor and check cursor=*?" + query);
+ if (!query.contains("&cursor=")) {
+ query += "&cursor=*";
+ }
+ }
+
+ try {
+ log.info("requestig URL [{}]", query);
+
+ URL qUrl = new URL(query);
+ log.debug("authMethod :" + authMethod);
+ if ("bearer".equalsIgnoreCase(this.authMethod)) {
+ log.trace("authMethod before inputStream: " + resultXml);
+ HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+ conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
+ conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
+ conn.setRequestMethod("GET");
+ theHttpInputStream = conn.getInputStream();
+ } else if (BASIC.equalsIgnoreCase(this.authMethod)) {
+ log.trace("authMethod before inputStream: " + resultXml);
+ HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+ conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
+ conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
+ conn.setRequestMethod("GET");
+ theHttpInputStream = conn.getInputStream();
+ } else {
+ theHttpInputStream = qUrl.openStream();
+ }
+
+ resultStream = theHttpInputStream;
+ if ("json".equals(resultOutputFormat)) {
+ resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
+ resultXml = jsonUtils.convertToXML(resultJson);
+ resultStream = IOUtils.toInputStream(resultXml, UTF_8);
+ }
+
+ if (!(emptyXml).equalsIgnoreCase(resultXml)) {
+ resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
+ nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
+ log.debug("nodeList.length: " + nodeList.getLength());
+ for (int i = 0; i < nodeList.getLength(); i++) {
+ StringWriter sw = new StringWriter();
+ transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
+ String toEnqueue = sw.toString();
+ if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
+ log.warn("The following record resulted in empty item for the feeding queue: " + resultXml);
+ } else {
+ recordQueue.add(sw.toString());
+ }
+ }
+ } else {
+ log.warn("resultXml is equal with emptyXml");
+ }
+
+ resumptionInt += resultSizeValue;
+
+ switch (resumptionType.toLowerCase()) {
+ case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
+ resumptionStr = xprResumptionPath.evaluate(resultNode);
+ break;
+
+ case "count": // begin at one step for all records, iterate over items
+ resumptionStr = Integer.toString(resumptionInt);
+ break;
+
+ case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
+ if (resultSizeValue < 2) {
+ throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
+ }
+ qUrlArgument = qUrl.getQuery();
+ String[] arrayQUrlArgument = qUrlArgument.split("&");
+ for (String arrayUrlArgStr : arrayQUrlArgument) {
+ if (arrayUrlArgStr.startsWith(resumptionParam)) {
+ String[] resumptionKeyValue = arrayUrlArgStr.split("=");
+ if (isInteger(resumptionKeyValue[1])) {
+ urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
+ log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
+ } else {
+ log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
+ }
+ }
+ }
+
+ if (((emptyXml).equalsIgnoreCase(resultXml))
+ || ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
+ // resumptionStr = "";
+ if (nodeList != null) {
+ discoverResultSize += nodeList.getLength();
+ }
+ resultTotal = discoverResultSize;
+ } else {
+ resumptionStr = Integer.toString(resumptionInt);
+ resultTotal = resumptionInt + 1;
+ if (nodeList != null) {
+ discoverResultSize += nodeList.getLength();
+ }
+ }
+ log.info("discoverResultSize: {}", discoverResultSize);
+ break;
+
+ case "pagination":
+ case "page": // pagination, iterate over page numbers
+ pagination += 1;
+ if (nodeList != null) {
+ discoverResultSize += nodeList.getLength();
+ } else {
+ resultTotal = discoverResultSize;
+ pagination = discoverResultSize;
+ }
+ resumptionInt = pagination;
+ resumptionStr = Integer.toString(resumptionInt);
+ break;
+
+ case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
+ // solr)
+ // isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
+ // deep-cursor, Param 'resultSizeValue' is less than 2");}
+
+ resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
+ queryParams = queryParams.replace("&cursor=*", "");
+
+ // terminating if length of nodeList is 0
+ if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
+ resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
+ } else {
+ resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
+ // because the iteration is over
+ // real length and the
+ // resultSizeValue is added before
+ // the switch()
+ }
+
+ discoverResultSize = nodeList.getLength();
+
+ log
+ .debug(
+ "downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
+ + queryParams + " resumptionLengthIncreased: " + resumptionInt);
+
+ break;
+
+ default: // otherwise: abort
+ // resultTotal = resumptionInt;
+ break;
+ }
+
+ } catch (Exception e) {
+ log.error(e.getMessage(), e);
+ throw new IllegalStateException("collection failed: " + e.getMessage());
+ }
+
+ try {
+ if (resultTotal == -1) {
+ resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
+ if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
+ resultTotal += 1;
+ } // to correct the upper bound
+ log.info("resultTotal was -1 is now: " + resultTotal);
+ }
+ } catch (Exception e) {
+ log.error(e.getMessage(), e);
+ throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
+ }
+ log.debug("resultTotal: " + resultTotal);
+ log.debug("resInt: " + resumptionInt);
+ if (resumptionInt <= resultTotal) {
+ nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
+ + queryFormat;
+ } else {
+ nextQuery = "";
+ // if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
+ // resumptionInt and prevent a NullPointer Exception at mdStore
+ }
+ log.debug("nextQueryUrl: " + nextQuery);
+ return nextQuery;
+
+ }
+
+ private boolean isInteger(String s) {
+ boolean isValidInteger = false;
+ try {
+ Integer.parseInt(s);
+
+ // s is a valid integer
+
+ isValidInteger = true;
+ } catch (NumberFormatException ex) {
+ // s is not an integer
+ }
+
+ return isValidInteger;
+ }
+
+ // Method to encode a string value using `UTF-8` encoding scheme
+ private String encodeValue(String value) {
+ try {
+ return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
+ } catch (UnsupportedEncodingException ex) {
+ throw new RuntimeException(ex.getCause());
+ }
+ }
+
+ public String getResultFormatValue() {
+ return resultFormatValue;
+ }
+
+ public String getResultOutputFormat() {
+ return resultOutputFormat;
+ }
+
+}
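The constructor above takes fifteen positional arguments, so a compact, hedged example of driving RestIterator directly (outside the plugin) may help. The endpoint, the XPath expressions and the availability of a default HttpClientParams constructor are assumptions.

```java
// Standalone usage sketch; endpoint and XPaths are placeholders, HttpClientParams() is assumed.
import eu.dnetlib.dhp.collection.HttpClientParams;
import eu.dnetlib.dhp.collection.plugin.rest.RestIterator;

public class RestIteratorSketch {

	public static void main(String[] args) {
		final RestIterator it = new RestIterator(
			new HttpClientParams(),            // assumed default constructor
			"https://example.org/api/records", // baseUrl (placeholder)
			"count",                           // resumptionType: plain offset paging
			"start",                           // resumptionParam, appended as &start=<n>
			"",                                // resumptionXpath (not needed for 'count')
			"//total",                         // resultTotalXpath (placeholder)
			"format",                          // resultFormatParam
			"json",                            // resultFormatValue
			"rows",                            // resultSizeParam
			"100",                             // resultSizeValue
			"q=*",                             // queryParams
			"//record",                        // entityXpath (placeholder)
			"",                                // authMethod (none)
			"",                                // authToken
			"json");                           // resultOutputFormat

		while (it.hasNext()) {
			// each item is a single entity serialized as XML, as selected by entityXpath
			System.out.println(it.next());
		}
	}
}
```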
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java
deleted file mode 100644
index e686ad5180..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java
+++ /dev/null
@@ -1,139 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker;
-
-import java.io.IOException;
-import java.net.URI;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import eu.dnetlib.collector.worker.model.ApiDescriptor;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
-import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
-import eu.dnetlib.message.Message;
-import eu.dnetlib.message.MessageManager;
-import eu.dnetlib.message.MessageType;
-
-public class DnetCollectorWorker {
-
- private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class);
-
- private final CollectorPluginFactory collectorPluginFactory;
-
- private final ArgumentApplicationParser argumentParser;
-
- private final MessageManager manager;
-
- public DnetCollectorWorker(
- final CollectorPluginFactory collectorPluginFactory,
- final ArgumentApplicationParser argumentParser,
- final MessageManager manager)
- throws DnetCollectorException {
- this.collectorPluginFactory = collectorPluginFactory;
- this.argumentParser = argumentParser;
- this.manager = manager;
- }
-
- public void collect() throws DnetCollectorException {
- try {
- final ObjectMapper jsonMapper = new ObjectMapper();
- final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class);
-
- final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol());
-
- final String hdfsuri = argumentParser.get("namenode");
-
- // ====== Init HDFS File System Object
- Configuration conf = new Configuration();
- // Set FileSystem URI
- conf.set("fs.defaultFS", hdfsuri);
- // Because of Maven
- conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
- conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
-
- System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS"));
- System.setProperty("hadoop.home.dir", "/");
- // Get the filesystem - HDFS
- FileSystem.get(URI.create(hdfsuri), conf);
- Path hdfswritepath = new Path(argumentParser.get("hdfsPath"));
-
- log.info("Created path " + hdfswritepath.toString());
-
- final Map<String, String> ongoingMap = new HashMap<>();
- final Map<String, String> reportMap = new HashMap<>();
- final AtomicInteger counter = new AtomicInteger(0);
- try (SequenceFile.Writer writer = SequenceFile
- .createWriter(
- conf,
- SequenceFile.Writer.file(hdfswritepath),
- SequenceFile.Writer.keyClass(IntWritable.class),
- SequenceFile.Writer.valueClass(Text.class))) {
- final IntWritable key = new IntWritable(counter.get());
- final Text value = new Text();
- plugin
- .collect(api)
- .forEach(
- content -> {
- key.set(counter.getAndIncrement());
- value.set(content);
- if (counter.get() % 10 == 0) {
- try {
- ongoingMap.put("ongoing", "" + counter.get());
- log
- .debug(
- "Sending message: "
- + manager
- .sendMessage(
- new Message(
- argumentParser.get("workflowId"),
- "Collection",
- MessageType.ONGOING,
- ongoingMap),
- argumentParser.get("rabbitOngoingQueue"),
- true,
- false));
- } catch (Exception e) {
- log.error("Error on sending message ", e);
- }
- }
- try {
- writer.append(key, value);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- });
- }
- ongoingMap.put("ongoing", "" + counter.get());
- manager
- .sendMessage(
- new Message(
- argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap),
- argumentParser.get("rabbitOngoingQueue"),
- true,
- false);
- reportMap.put("collected", "" + counter.get());
- manager
- .sendMessage(
- new Message(
- argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
- argumentParser.get("rabbitOngoingQueue"),
- true,
- false);
- manager.close();
- } catch (Throwable e) {
- throw new DnetCollectorException("Error on collecting ", e);
- }
- }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java
deleted file mode 100644
index da30e87937..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java
+++ /dev/null
@@ -1,49 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker;
-
-import org.apache.commons.io.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
-import eu.dnetlib.message.MessageManager;
-
-/**
- * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module
- * will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector
- * plugin to use and where store the data into HDFS path
- *
- * @author Sandro La Bruzzo
- */
-public class DnetCollectorWorkerApplication {
-
- private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class);
-
- private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory();
-
- private static ArgumentApplicationParser argumentParser;
-
- /** @param args */
- public static void main(final String[] args) throws Exception {
-
- argumentParser = new ArgumentApplicationParser(
- IOUtils
- .toString(
- DnetCollectorWorker.class
- .getResourceAsStream(
- "/eu/dnetlib/collector/worker/collector_parameter.json")));
- argumentParser.parseArgument(args);
- log.info("hdfsPath =" + argumentParser.get("hdfsPath"));
- log.info("json = " + argumentParser.get("apidescriptor"));
- final MessageManager manager = new MessageManager(
- argumentParser.get("rabbitHost"),
- argumentParser.get("rabbitUser"),
- argumentParser.get("rabbitPassword"),
- false,
- false,
- null);
- final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager);
- worker.collect();
- }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java
deleted file mode 100644
index dcaf0ea562..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java
+++ /dev/null
@@ -1,19 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker.utils;
-
-import java.util.LinkedList;
-
-public class CollectorPluginErrorLogList extends LinkedList<String> {
-
- private static final long serialVersionUID = -6925786561303289704L;
-
- @Override
- public String toString() {
- String log = "";
- int index = 0;
- for (final String errorMessage : this) {
- log += String.format("Retry #%s: %s / ", index++, errorMessage);
- }
- return log;
- }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java
deleted file mode 100644
index 7a0028e793..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java
+++ /dev/null
@@ -1,20 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker.utils;
-
-import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
-import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
-
-public class CollectorPluginFactory {
-
- public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException {
- if (protocol == null)
- throw new DnetCollectorException("protocol cannot be null");
- switch (protocol.toLowerCase().trim()) {
- case "oai":
- return new OaiCollectorPlugin();
- default:
- throw new DnetCollectorException("UNknown protocol");
- }
- }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java
deleted file mode 100644
index 5d6108fad8..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java
+++ /dev/null
@@ -1,244 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker.utils;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.*;
-import java.security.GeneralSecurityException;
-import java.security.cert.X509Certificate;
-import java.util.List;
-import java.util.Map;
-
-import javax.net.ssl.HttpsURLConnection;
-import javax.net.ssl.SSLContext;
-import javax.net.ssl.TrustManager;
-import javax.net.ssl.X509TrustManager;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang.math.NumberUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
-
-public class HttpConnector {
-
- private static final Log log = LogFactory.getLog(HttpConnector.class);
-
- private int maxNumberOfRetry = 6;
- private int defaultDelay = 120; // seconds
- private int readTimeOut = 120; // seconds
-
- private String responseType = null;
-
- private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
-
- public HttpConnector() {
- CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
- }
-
- /**
- * Given the URL returns the content via HTTP GET
- *
- * @param requestUrl the URL
- * @return the content of the downloaded resource
- * @throws DnetCollectorException when retrying more than maxNumberOfRetry times
- */
- public String getInputSource(final String requestUrl) throws DnetCollectorException {
- return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
- }
-
- /**
- * Given the URL returns the content as a stream via HTTP GET
- *
- * @param requestUrl the URL
- * @return the content of the downloaded resource as InputStream
- * @throws DnetCollectorException when retrying more than maxNumberOfRetry times
- */
- public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException {
- return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
- }
-
- private String attemptDownlaodAsString(
- final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
- throws DnetCollectorException {
- try {
- final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
- try {
- return IOUtils.toString(s);
- } catch (final IOException e) {
- log.error("error while retrieving from http-connection occured: " + requestUrl, e);
- Thread.sleep(defaultDelay * 1000);
- errorList.add(e.getMessage());
- return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
- } finally {
- IOUtils.closeQuietly(s);
- }
- } catch (final InterruptedException e) {
- throw new DnetCollectorException(e);
- }
- }
-
- private InputStream attemptDownload(
- final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
- throws DnetCollectorException {
-
- if (retryNumber > maxNumberOfRetry) {
- throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList);
- }
-
- log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
- try {
- InputStream input = null;
-
- try {
- final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
- urlConn.setInstanceFollowRedirects(false);
- urlConn.setReadTimeout(readTimeOut * 1000);
- urlConn.addRequestProperty("User-Agent", userAgent);
-
- if (log.isDebugEnabled()) {
- logHeaderFields(urlConn);
- }
-
- final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
- if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
- log.warn("waiting and repeating request after " + retryAfter + " sec.");
- Thread.sleep(retryAfter * 1000);
- errorList.add("503 Service Unavailable");
- urlConn.disconnect();
- return attemptDownload(requestUrl, retryNumber + 1, errorList);
- } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM
- || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) {
- final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
- log.debug("The requested url has been moved to " + newUrl);
- errorList
- .add(
- String
- .format(
- "%s %s. Moved to: %s",
- urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
- urlConn.disconnect();
- return attemptDownload(newUrl, retryNumber + 1, errorList);
- } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
- log
- .error(
- String
- .format(
- "HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
- Thread.sleep(defaultDelay * 1000);
- errorList
- .add(
- String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
- urlConn.disconnect();
- return attemptDownload(requestUrl, retryNumber + 1, errorList);
- } else {
- input = urlConn.getInputStream();
- responseType = urlConn.getContentType();
- return input;
- }
- } catch (final IOException e) {
- log.error("error while retrieving from http-connection occured: " + requestUrl, e);
- Thread.sleep(defaultDelay * 1000);
- errorList.add(e.getMessage());
- return attemptDownload(requestUrl, retryNumber + 1, errorList);
- }
- } catch (final InterruptedException e) {
- throw new DnetCollectorException(e);
- }
- }
-
- private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
- log.debug("StatusCode: " + urlConn.getResponseMessage());
-
- for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
- if (e.getKey() != null) {
- for (final String v : e.getValue()) {
- log.debug(" key: " + e.getKey() + " - value: " + v);
- }
- }
- }
- }
-
- private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
- for (final String key : headerMap.keySet()) {
- if (key != null
- && key.toLowerCase().equals("retry-after")
- && headerMap.get(key).size() > 0
- && NumberUtils.isNumber(headerMap.get(key).get(0))) {
- return Integer.parseInt(headerMap.get(key).get(0)) + 10;
- }
- }
- return -1;
- }
-
- private String obtainNewLocation(final Map<String, List<String>> headerMap)
- throws DnetCollectorException {
- for (final String key : headerMap.keySet()) {
- if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) {
- return headerMap.get(key).get(0);
- }
- }
- throw new DnetCollectorException(
- "The requested url has been MOVED, but 'location' param is MISSING");
- }
-
- /**
- * register for https scheme; this is a workaround and not intended for the use in trusted environments
- */
- public void initTrustManager() {
- final X509TrustManager tm = new X509TrustManager() {
-
- @Override
- public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
- }
-
- @Override
- public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
- }
-
- @Override
- public X509Certificate[] getAcceptedIssuers() {
- return null;
- }
- };
- try {
- final SSLContext ctx = SSLContext.getInstance("TLS");
- ctx.init(null, new TrustManager[] {
- tm
- }, null);
- HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
- } catch (final GeneralSecurityException e) {
- log.fatal(e);
- throw new IllegalStateException(e);
- }
- }
-
- public int getMaxNumberOfRetry() {
- return maxNumberOfRetry;
- }
-
- public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
- this.maxNumberOfRetry = maxNumberOfRetry;
- }
-
- public int getDefaultDelay() {
- return defaultDelay;
- }
-
- public void setDefaultDelay(final int defaultDelay) {
- this.defaultDelay = defaultDelay;
- }
-
- public int getReadTimeOut() {
- return readTimeOut;
- }
-
- public void setReadTimeOut(final int readTimeOut) {
- this.readTimeOut = readTimeOut;
- }
-
- public String getResponseType() {
- return responseType;
- }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java
new file mode 100644
index 0000000000..45bd844e2d
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java
@@ -0,0 +1,29 @@
+
+package eu.dnetlib.dhp.transformation;
+
+public class DnetTransformationException extends Exception {
+
+ public DnetTransformationException() {
+ super();
+ }
+
+ public DnetTransformationException(
+ final String message,
+ final Throwable cause,
+ final boolean enableSuppression,
+ final boolean writableStackTrace) {
+ super(message, cause, enableSuppression, writableStackTrace);
+ }
+
+ public DnetTransformationException(final String message, final Throwable cause) {
+ super(message, cause);
+ }
+
+ public DnetTransformationException(final String message) {
+ super(message);
+ }
+
+ public DnetTransformationException(final Throwable cause) {
+ super(cause);
+ }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java
deleted file mode 100644
index f4bf78e189..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java
+++ /dev/null
@@ -1,74 +0,0 @@
-
-package eu.dnetlib.dhp.transformation;
-
-import java.io.ByteArrayInputStream;
-import java.io.StringWriter;
-import java.util.Map;
-
-import javax.xml.transform.stream.StreamSource;
-
-import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.util.LongAccumulator;
-
-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.transformation.functions.Cleaner;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
-import net.sf.saxon.s9api.*;
-
-public class TransformFunction implements MapFunction<MetadataRecord, MetadataRecord> {
-
- private final LongAccumulator totalItems;
- private final LongAccumulator errorItems;
- private final LongAccumulator transformedItems;
- private final String transformationRule;
- private final Cleaner cleanFunction;
-
- private final long dateOfTransformation;
-
- public TransformFunction(
- LongAccumulator totalItems,
- LongAccumulator errorItems,
- LongAccumulator transformedItems,
- final String transformationRule,
- long dateOfTransformation,
- final Map<String, Vocabulary> vocabularies)
- throws Exception {
- this.totalItems = totalItems;
- this.errorItems = errorItems;
- this.transformedItems = transformedItems;
- this.transformationRule = transformationRule;
- this.dateOfTransformation = dateOfTransformation;
- cleanFunction = new Cleaner(vocabularies);
- }
-
- @Override
- public MetadataRecord call(MetadataRecord value) {
- totalItems.add(1);
- try {
- Processor processor = new Processor(false);
- processor.registerExtensionFunction(cleanFunction);
- final XsltCompiler comp = processor.newXsltCompiler();
- XsltExecutable xslt = comp
- .compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes())));
- XdmNode source = processor
- .newDocumentBuilder()
- .build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes())));
- XsltTransformer trans = xslt.load();
- trans.setInitialContextNode(source);
- final StringWriter output = new StringWriter();
- Serializer out = processor.newSerializer(output);
- out.setOutputProperty(Serializer.Property.METHOD, "xml");
- out.setOutputProperty(Serializer.Property.INDENT, "yes");
- trans.setDestination(out);
- trans.transform();
- final String xml = output.toString();
- value.setBody(xml);
- value.setDateOfTransformation(dateOfTransformation);
- transformedItems.add(1);
- return value;
- } catch (Throwable e) {
- errorItems.add(1);
- return null;
- }
- }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java
index 8737d36ef1..c7201a2674 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java
@@ -1,43 +1,43 @@
package eu.dnetlib.dhp.transformation;
+import static eu.dnetlib.dhp.common.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static eu.dnetlib.dhp.utils.DHPUtils.*;
-import java.io.ByteArrayInputStream;
-import java.util.HashMap;
+import java.io.IOException;
import java.util.Map;
-import java.util.Objects;
import java.util.Optional;
-import org.apache.commons.cli.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
-import org.dom4j.Document;
-import org.dom4j.DocumentException;
-import org.dom4j.Node;
-import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob;
-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
-import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
-import eu.dnetlib.dhp.utils.DHPUtils;
-import eu.dnetlib.message.Message;
-import eu.dnetlib.message.MessageManager;
-import eu.dnetlib.message.MessageType;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.message.MessageSender;
+import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion;
+import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class TransformSparkJobNode {
private static final Logger log = LoggerFactory.getLogger(TransformSparkJobNode.class);
+ private static final int RECORDS_PER_TASK = 200;
+
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@@ -55,67 +55,107 @@ public class TransformSparkJobNode {
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
- final String inputPath = parser.get("input");
- final String outputPath = parser.get("output");
- final String workflowId = parser.get("workflowId");
- final String trasformationRule = extractXSLTFromTR(
- Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule"))));
+ final String mdstoreInputVersion = parser.get("mdstoreInputVersion");
+ final String mdstoreOutputVersion = parser.get("mdstoreOutputVersion");
- final String rabbitUser = parser.get("rabbitUser");
- final String rabbitPassword = parser.get("rabbitPassword");
- final String rabbitHost = parser.get("rabbitHost");
- final String rabbitReportQueue = parser.get("rabbitReportQueue");
- final long dateOfCollection = new Long(parser.get("dateOfCollection"));
- final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
+ final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class);
+ final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH;
+ log.info("inputPath: {}", inputPath);
+
+ final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class);
+ final String outputBasePath = cleanedMdStoreVersion.getHdfsPath();
+ log.info("outputBasePath: {}", outputBasePath);
+
+ final String isLookupUrl = parser.get("isLookupUrl");
+ log.info(String.format("isLookupUrl: %s", isLookupUrl));
+
+ final String dateOfTransformation = parser.get("dateOfTransformation");
+ log.info(String.format("dateOfTransformation: %s", dateOfTransformation));
+
+ final Integer rpt = Optional
+ .ofNullable(parser.get("recordsPerTask"))
+ .map(Integer::valueOf)
+ .orElse(RECORDS_PER_TASK);
+
+ final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
+
+ final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);
+
+ log.info("Retrieved {} vocabularies", vocabularies.vocabularyNames().size());
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
- final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
- final Dataset<MetadataRecord> mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder);
- final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems");
- final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems");
- final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems");
- final Map<String, Vocabulary> vocabularies = new HashMap<>();
- vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages"));
- final TransformFunction transformFunction = new TransformFunction(
- totalItems,
- errorItems,
- transformedItems,
- trasformationRule,
- dateOfCollection,
- vocabularies);
- mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath);
- if (rabbitHost != null) {
- System.out.println("SEND FINAL REPORT");
- final Map reportMap = new HashMap<>();
- reportMap.put("inputItem", "" + totalItems.value());
- reportMap.put("invalidRecords", "" + errorItems.value());
- reportMap.put("mdStoreSize", "" + transformedItems.value());
- System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap));
- if (!test) {
- final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false,
- false,
- null);
- manager
- .sendMessage(
- new Message(workflowId, "Transform", MessageType.REPORT, reportMap),
- rabbitReportQueue,
- true,
- false);
- manager.close();
- }
- }
+ transformRecords(
+ parser.getObjectMap(), isLookupService, spark, inputPath, outputBasePath, rpt);
});
-
}
- private static String extractXSLTFromTR(final String tr) throws DocumentException {
- SAXReader reader = new SAXReader();
- Document document = reader.read(new ByteArrayInputStream(tr.getBytes()));
- Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']");
- return node.asXML();
+ public static void transformRecords(final Map<String, String> args, final ISLookUpService isLookUpService,
+ final SparkSession spark, final String inputPath, final String outputBasePath, final Integer rpt)
+ throws DnetTransformationException, IOException {
+
+ final LongAccumulator totalItems = spark.sparkContext().longAccumulator(CONTENT_TOTALITEMS);
+ final LongAccumulator errorItems = spark.sparkContext().longAccumulator(CONTENT_INVALIDRECORDS);
+ final LongAccumulator transformedItems = spark.sparkContext().longAccumulator(CONTENT_TRANSFORMEDRECORDS);
+ final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems);
+ final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
+
+ final String dnetMessageManagerURL = args.get(DNET_MESSAGE_MGR_URL);
+ log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
+
+ final String workflowId = args.get("workflowId");
+ log.info("workflowId is {}", workflowId);
+
+ MapFunction<MetadataRecord, MetadataRecord> x = TransformationFactory
+ .getTransformationPlugin(args, ct, isLookUpService);
+
+ final Dataset<MetadataRecord> inputMDStore = spark
+ .read()
+ .format("parquet")
+ .load(inputPath)
+ .as(encoder);
+
+ final long totalInput = inputMDStore.count();
+
+ final MessageSender messageSender = new MessageSender(dnetMessageManagerURL, workflowId);
+ try (AggregatorReport report = new AggregatorReport(messageSender)) {
+ try {
+ JavaRDD<MetadataRecord> mdstore = inputMDStore
+ .javaRDD()
+ .repartition(getRepartitionNumber(totalInput, rpt))
+ .map((Function<MetadataRecord, MetadataRecord>) x::call);
+ saveDataset(spark.createDataset(mdstore.rdd(), encoder), outputBasePath + MDSTORE_DATA_PATH);
+
+ log.info("Transformed item " + ct.getProcessedItems().count());
+ log.info("Total item " + ct.getTotalItems().count());
+ log.info("Transformation Error item " + ct.getErrorItems().count());
+
+ final long mdStoreSize = spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count();
+ writeHdfsFile(
+ spark.sparkContext().hadoopConfiguration(),
+ "" + mdStoreSize, outputBasePath + MDSTORE_SIZE_PATH);
+ } catch (Throwable e) {
+ log.error("error during record transformation", e);
+ report.put(TransformSparkJobNode.class.getSimpleName(), e.getMessage());
+ report.put(CONTENT_TOTALITEMS, ct.getTotalItems().value().toString());
+ report.put(CONTENT_INVALIDRECORDS, ct.getErrorItems().value().toString());
+ report.put(CONTENT_TRANSFORMEDRECORDS, ct.getProcessedItems().value().toString());
+ throw e;
+ }
+ }
}
+
+ /**
+ * Calculates the number of partitions allocating at most @rpt records for a single transformation task.
+ * @param totalInput
+ * @param rpt
+ * @return
+ */
+ private static int getRepartitionNumber(long totalInput, Integer rpt) {
+ return Math.max(1, (int) (totalInput / rpt));
+ }
+
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java
new file mode 100644
index 0000000000..096d0e2896
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java
@@ -0,0 +1,69 @@
+
+package eu.dnetlib.dhp.transformation;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.api.java.function.MapFunction;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+public class TransformationFactory {
+
+ private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class);
+ public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//RESOURCE_IDENTIFIER/@value = \"%s\" return $x//CODE/*[local-name() =\"stylesheet\"]";
+
+ public static MapFunction<MetadataRecord, MetadataRecord> getTransformationPlugin(
+ final Map<String, String> jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService)
+ throws DnetTransformationException {
+
+ try {
+ final String transformationPlugin = jobArgument.get("transformationPlugin");
+
+ log.info("Transformation plugin required " + transformationPlugin);
+ switch (transformationPlugin) {
+ case "XSLT_TRANSFORM": {
+ final String transformationRuleId = jobArgument.get("transformationRuleId");
+ if (StringUtils.isBlank(transformationRuleId))
+ throw new DnetTransformationException("Missing Parameter transformationRule");
+ final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);
+
+ final String transformationRule = queryTransformationRuleFromIS(
+ transformationRuleId, isLookupService);
+
+ final long dateOfTransformation = new Long(jobArgument.get("dateOfTransformation"));
+ return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation,
+ vocabularies);
+
+ }
+ default:
+ throw new DnetTransformationException(
+ "transformation plugin does not exists for " + transformationPlugin);
+
+ }
+
+ } catch (Throwable e) {
+ throw new DnetTransformationException(e);
+ }
+ }
+
+ private static String queryTransformationRuleFromIS(final String transformationRuleId,
+ final ISLookUpService isLookUpService) throws Exception {
+ final String query = String.format(TRULE_XQUERY, transformationRuleId);
+ System.out.println("asking query to IS: " + query);
+ List<String> result = isLookUpService.quickSearchProfile(query);
+
+ if (result == null || result.isEmpty())
+ throw new DnetTransformationException(
+ "Unable to find transformation rule with name: " + transformationRuleId);
+ return result.get(0);
+ }
+
+}
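A hedged sketch of the job arguments this factory expects may clarify the contract: only the XSLT_TRANSFORM plugin is handled, and the stylesheet is resolved through the IS lookup. The IS endpoint URL, the rule identifier and the AggregationCounter wiring are placeholders/assumptions.

```java
// Sketch of the argument map consumed by TransformationFactory; URLs and ids are placeholders.
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.function.MapFunction;

import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.TransformationFactory;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class TransformationFactorySketch {

	public static MapFunction<MetadataRecord, MetadataRecord> buildPlugin(final AggregationCounter counters)
		throws Exception {

		final Map<String, String> args = new HashMap<>();
		args.put("transformationPlugin", "XSLT_TRANSFORM");       // the only plugin currently supported
		args.put("transformationRuleId", "some-rule-identifier"); // placeholder RESOURCE_IDENTIFIER value
		args.put("dateOfTransformation", "1609459200000");        // parsed as a long (epoch millis assumed)

		final ISLookUpService isLookUp = ISLookupClientFactory
			.getLookUpService("http://services.example.org/is/services/isLookUp"); // placeholder URL

		return TransformationFactory.getTransformationPlugin(args, counters, isLookUp);
	}
}
```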
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java
deleted file mode 100644
index b5ac18169e..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java
+++ /dev/null
@@ -1,53 +0,0 @@
-
-package eu.dnetlib.dhp.transformation.vocabulary;
-
-import java.io.Serializable;
-
-public class Term implements Serializable {
-
- private String englishName;
- private String nativeName;
- private String encoding;
- private String code;
- private String synonyms;
-
- public String getEnglishName() {
- return englishName;
- }
-
- public void setEnglishName(String englishName) {
- this.englishName = englishName;
- }
-
- public String getNativeName() {
- return nativeName;
- }
-
- public void setNativeName(String nativeName) {
- this.nativeName = nativeName;
- }
-
- public String getEncoding() {
- return encoding;
- }
-
- public void setEncoding(String encoding) {
- this.encoding = encoding;
- }
-
- public String getCode() {
- return code;
- }
-
- public void setCode(String code) {
- this.code = code;
- }
-
- public String getSynonyms() {
- return synonyms;
- }
-
- public void setSynonyms(String synonyms) {
- this.synonyms = synonyms;
- }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java
deleted file mode 100644
index a9da6b7256..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java
+++ /dev/null
@@ -1,54 +0,0 @@
-
-package eu.dnetlib.dhp.transformation.vocabulary;
-
-import java.io.Serializable;
-import java.util.List;
-
-public class Vocabulary implements Serializable {
-
- private String id;
- private String name;
- private String description;
- private String code;
- private List<Term> terms;
-
- public String getId() {
- return id;
- }
-
- public void setId(String id) {
- this.id = id;
- }
-
- public String getName() {
- return name;
- }
-
- public void setName(String name) {
- this.name = name;
- }
-
- public String getDescription() {
- return description;
- }
-
- public void setDescription(String description) {
- this.description = description;
- }
-
- public String getCode() {
- return code;
- }
-
- public void setCode(String code) {
- this.code = code;
- }
-
- public List<Term> getTerms() {
- return terms;
- }
-
- public void setTerms(List<Term> terms) {
- this.terms = terms;
- }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java
deleted file mode 100644
index 10e959be05..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java
+++ /dev/null
@@ -1,24 +0,0 @@
-
-package eu.dnetlib.dhp.transformation.vocabulary;
-
-import java.io.Serializable;
-import java.net.URL;
-import java.nio.charset.Charset;
-
-import org.apache.commons.io.IOUtils;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-public class VocabularyHelper implements Serializable {
-
- private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json";
-
- public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception {
- final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName));
-
- final String response = IOUtils.toString(url, Charset.defaultCharset());
- final ObjectMapper jsonMapper = new ObjectMapper();
- final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class);
- return vocabulary;
- }
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java
similarity index 54%
rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java
index 7f9b6646c3..664215c0e3 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java
@@ -1,25 +1,25 @@
-package eu.dnetlib.dhp.transformation.functions;
+package eu.dnetlib.dhp.transformation.xslt;
-import java.util.Map;
-import java.util.Optional;
+import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
-import eu.dnetlib.dhp.transformation.vocabulary.Term;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
+import java.io.Serializable;
+
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
import net.sf.saxon.s9api.*;
-import scala.Serializable;
public class Cleaner implements ExtensionFunction, Serializable {
- private final Map<String, Vocabulary> vocabularies;
+ private final VocabularyGroup vocabularies;
- public Cleaner(Map<String, Vocabulary> vocabularies) {
+ public Cleaner(final VocabularyGroup vocabularies) {
this.vocabularies = vocabularies;
}
@Override
public QName getName() {
- return new QName("http://eu/dnetlib/trasform/extension", "clean");
+ return new QName(QNAME_BASE_URI + "/clean", "clean");
}
@Override
@@ -30,23 +30,22 @@ public class Cleaner implements ExtensionFunction, Serializable {
@Override
public SequenceType[] getArgumentTypes() {
return new SequenceType[] {
- SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE),
+ SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_MORE),
SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE)
};
}
@Override
public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
+ XdmValue r = xdmValues[0];
+ if (r.size() == 0) {
+ return new XdmAtomicValue("");
+ }
final String currentValue = xdmValues[0].itemAt(0).getStringValue();
final String vocabularyName = xdmValues[1].itemAt(0).getStringValue();
- Optional<Term> cleanedValue = vocabularies
- .get(vocabularyName)
- .getTerms()
- .stream()
- .filter(it -> it.getNativeName().equalsIgnoreCase(currentValue))
- .findAny();
+ Qualifier cleanedValue = vocabularies.getSynonymAsQualifier(vocabularyName, currentValue);
return new XdmAtomicValue(
- cleanedValue.isPresent() ? cleanedValue.get().getCode() : currentValue);
+ cleanedValue != null ? cleanedValue.getClassid() : currentValue);
}
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java
new file mode 100644
index 0000000000..9da0747e65
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java
@@ -0,0 +1,49 @@
+
+package eu.dnetlib.dhp.transformation.xslt;
+
+import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
+
+import java.io.Serializable;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
+import net.sf.saxon.s9api.*;
+
+public class DateCleaner implements ExtensionFunction, Serializable {
+
+ @Override
+ public QName getName() {
+ return new QName(QNAME_BASE_URI + "/dateISO", "dateISO");
+ }
+
+ @Override
+ public SequenceType getResultType() {
+ return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
+ }
+
+ @Override
+ public SequenceType[] getArgumentTypes() {
+ return new SequenceType[] {
+ SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
+ };
+ }
+
+ @Override
+ public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
+ XdmValue r = xdmValues[0];
+ if (r.size() == 0) {
+ return new XdmAtomicValue("");
+ }
+ final String currentValue = xdmValues[0].itemAt(0).getStringValue();
+ return new XdmAtomicValue(clean(currentValue));
+ }
+
+ // for backward compatibility with the existing unit tests
+ public String clean(String date) {
+ return GraphCleaningFunctions.cleanDate(date);
+ }
+}
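
A quick way to sanity-check the new dateISO extension is to call the backward-compatible clean(String) method directly, as the comment above suggests the existing unit tests do. The sketch below is editorial and not part of the patch; the expected outputs are assumptions based on GraphCleaningFunctions.cleanDate normalising heterogeneous date strings to ISO-8601.

package eu.dnetlib.dhp.transformation.xslt;

// Editorial sketch, not part of the patch: exercising DateCleaner.clean directly.
public class DateCleanerSketch {

	public static void main(String[] args) {
		final DateCleaner cleaner = new DateCleaner();

		// The expected results below are assumptions: GraphCleaningFunctions.cleanDate
		// is understood to normalise common date layouts to ISO-8601 (yyyy-MM-dd).
		System.out.println(cleaner.clean("20/09/2018")); // assumed to print 2018-09-20
		System.out.println(cleaner.clean("2021"));       // assumed to print an ISO form of the year 2021
	}
}
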
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java
new file mode 100644
index 0000000000..e3d5888586
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java
@@ -0,0 +1,198 @@
+
+package eu.dnetlib.dhp.transformation.xslt;
+
+import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
+
+import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.text.Normalizer;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.hash.Hashing;
+
+import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize;
+import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations;
+import net.sf.saxon.s9api.*;
+
+public class PersonCleaner implements ExtensionFunction, Serializable {
+
+ private static final long serialVersionUID = 1L;
+ private List<String> firstname = Lists.newArrayList();
+ private List<String> surname = Lists.newArrayList();
+ private List<String> fullname = Lists.newArrayList();
+
+ private static final Set<String> particles = null;
+
+ public PersonCleaner() {
+
+ }
+
+ private String normalize(String s) {
+ s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
+ s = s.replaceAll("\\(.+\\)", "");
+ s = s.replaceAll("\\[.+\\]", "");
+ s = s.replaceAll("\\{.+\\}", "");
+ s = s.replaceAll("\\s+-\\s+", "-");
+
+// s = s.replaceAll("[\\W&&[^,-]]", " ");
+
+// System.out.println("class Person: s: " + s);
+
+// s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", " ");
+ s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
+ s = s.replace("\\d", " ");
+ s = s.replace("\\n", " ");
+ s = s.replace("\\.", " ");
+ s = s.replaceAll("\\s+", " ");
+
+ if (s.contains(",")) {
+ // System.out.println("class Person: s: " + s);
+
+ String[] arr = s.split(",");
+ if (arr.length == 1) {
+
+ fullname = splitTerms(arr[0]);
+ } else if (arr.length > 1) {
+ surname = splitTerms(arr[0]);
+ firstname = splitTermsFirstName(arr[1]);
+// System.out.println("class Person: surname: " + surname);
+// System.out.println("class Person: firstname: " + firstname);
+
+ fullname.addAll(surname);
+ fullname.addAll(firstname);
+ }
+ } else {
+ fullname = splitTerms(s);
+
+ int lastInitialPosition = fullname.size();
+ boolean hasSurnameInUpperCase = false;
+
+ for (int i = 0; i < fullname.size(); i++) {
+ String term = fullname.get(i);
+ if (term.length() == 1) {
+ lastInitialPosition = i;
+ } else if (term.equals(term.toUpperCase())) {
+ hasSurnameInUpperCase = true;
+ }
+ }
+ if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
+ firstname = fullname.subList(0, lastInitialPosition + 1);
+ System.out.println("name: " + firstname);
+ surname = fullname.subList(lastInitialPosition + 1, fullname.size());
+ } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
+ for (String term : fullname) {
+ if (term.length() > 1 && term.equals(term.toUpperCase())) {
+ surname.add(term);
+ } else {
+ firstname.add(term);
+ }
+ }
+ } else if (lastInitialPosition == fullname.size()) {
+ surname = fullname.subList(lastInitialPosition - 1, fullname.size());
+ firstname = fullname.subList(0, lastInitialPosition - 1);
+ }
+
+ }
+ return getNormalisedFullname(); // avoid returning null, which would break the XdmAtomicValue wrapping in call()
+ }
+
+ private List<String> splitTermsFirstName(String s) {
+ List<String> list = Lists.newArrayList();
+ for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
+ if (s.trim().matches("\\p{Lu}{2,3}")) {
+ String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase)
+ for (String p : parts) {
+ if (p.length() > 0)
+ list.add(p);
+ }
+ } else {
+ list.add(part);
+ }
+
+ }
+ return list;
+ }
+
+ private List<String> splitTerms(String s) {
+ if (particles == null) {
+ // particles = NGramUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
+ }
+
+ List<String> list = Lists.newArrayList();
+ for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
+ // if (!particles.contains(part.toLowerCase())) {
+ list.add(part);
+
+ // }
+ }
+ return list;
+ }
+
+ public List<String> getFirstname() {
+ return firstname;
+ }
+
+ public List<String> getSurname() {
+ return surname;
+ }
+
+ public List<String> getFullname() {
+ return fullname;
+ }
+
+ public String hash() {
+ return Hashing.murmur3_128().hashString(getNormalisedFullname(), StandardCharsets.UTF_8).toString();
+ }
+
+ public String getNormalisedFullname() {
+ return isAccurate() ? Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations())
+ : Joiner.on(" ").join(fullname);
+ // return isAccurate() ?
+ // Joiner.on(" ").join(getCapitalSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) :
+ // Joiner.on(" ").join(fullname);
+ }
+
+ public List<String> getCapitalSurname() {
+ return Lists.newArrayList(Iterables.transform(surname, new Capitalize()));
+ }
+
+ public List<String> getNameWithAbbreviations() {
+ return Lists.newArrayList(Iterables.transform(firstname, new DotAbbreviations()));
+ }
+
+ public boolean isAccurate() {
+ return (firstname != null && surname != null && !firstname.isEmpty() && !surname.isEmpty());
+ }
+
+ @Override
+ public QName getName() {
+ return new QName(QNAME_BASE_URI + "/person", "normalize");
+ }
+
+ @Override
+ public SequenceType getResultType() {
+ return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
+ }
+
+ @Override
+ public SequenceType[] getArgumentTypes() {
+ return new SequenceType[] {
+ SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
+ };
+ }
+
+ @Override
+ public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
+ XdmValue r = xdmValues[0];
+ if (r.size() == 0) {
+ return new XdmAtomicValue("");
+ }
+ final String currentValue = xdmValues[0].itemAt(0).getStringValue();
+ return new XdmAtomicValue(normalize(currentValue));
+ }
+}
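
PersonCleaner is exposed to stylesheets as a Saxon extension function, so outside of a full XSLT run the easiest probe is to invoke call() with a single atomic value. The sketch below is editorial and not part of the patch; the input name is arbitrary and the printed value depends on which parsing branch of normalize() is taken, so no specific result is asserted.

package eu.dnetlib.dhp.transformation.xslt;

import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XdmAtomicValue;
import net.sf.saxon.s9api.XdmValue;

// Editorial sketch, not part of the patch: driving the extension function directly.
public class PersonCleanerSketch {

	public static void main(String[] args) throws SaxonApiException {
		final PersonCleaner cleaner = new PersonCleaner();

		// A "surname, firstname" input exercises the comma branch of normalize().
		final XdmValue result = cleaner.call(new XdmValue[] {
			new XdmAtomicValue("Artini, Michele")
		});

		// Shown only to illustrate the wiring; the exact output depends on
		// getNormalisedFullname() and the parsing branches above.
		System.out.println(result.itemAt(0).getStringValue());
	}
}
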
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
new file mode 100644
index 0000000000..43291e5de1
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
@@ -0,0 +1,109 @@
+
+package eu.dnetlib.dhp.transformation.xslt;
+
+import java.io.ByteArrayInputStream;
+import java.io.Serializable;
+import java.io.StringWriter;
+import java.nio.charset.StandardCharsets;
+
+import javax.xml.transform.stream.StreamSource;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.api.java.function.MapFunction;
+
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
+import net.sf.saxon.s9api.*;
+
+public class XSLTTransformationFunction implements MapFunction<MetadataRecord, MetadataRecord>, Serializable {
+
+ public final static String QNAME_BASE_URI = "http://eu/dnetlib/transform";
+
+ private final static String DATASOURCE_ID_PARAM = "varDataSourceId";
+
+ private final static String DATASOURCE_NAME_PARAM = "varOfficialName";
+
+ private final AggregationCounter aggregationCounter;
+
+ private final String transformationRule;
+
+ private final Cleaner cleanFunction;
+
+ private final long dateOfTransformation;
+
+ private final VocabularyGroup vocabularies;
+
+ public XSLTTransformationFunction(
+ final AggregationCounter aggregationCounter,
+ final String transformationRule,
+ long dateOfTransformation,
+ final VocabularyGroup vocabularies)
+ throws Exception {
+ this.aggregationCounter = aggregationCounter;
+ this.transformationRule = transformationRule;
+ this.vocabularies = vocabularies;
+ this.dateOfTransformation = dateOfTransformation;
+ cleanFunction = new Cleaner(vocabularies);
+ }
+
+ @Override
+ public MetadataRecord call(MetadataRecord value) {
+ aggregationCounter.getTotalItems().add(1);
+ try {
+ Processor processor = new Processor(false);
+
+ processor.registerExtensionFunction(cleanFunction);
+ processor.registerExtensionFunction(new DateCleaner());
+ processor.registerExtensionFunction(new PersonCleaner());
+
+ final XsltCompiler comp = processor.newXsltCompiler();
+ QName datasourceIDParam = new QName(DATASOURCE_ID_PARAM);
+ comp.setParameter(datasourceIDParam, new XdmAtomicValue(value.getProvenance().getDatasourceId()));
+ QName datasourceNameParam = new QName(DATASOURCE_NAME_PARAM);
+ comp.setParameter(datasourceNameParam, new XdmAtomicValue(value.getProvenance().getDatasourceName()));
+ XsltExecutable xslt = comp
+ .compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8)));
+ XdmNode source = processor
+ .newDocumentBuilder()
+ .build(new StreamSource(IOUtils.toInputStream(value.getBody(), StandardCharsets.UTF_8)));
+ XsltTransformer trans = xslt.load();
+ trans.setInitialContextNode(source);
+ final StringWriter output = new StringWriter();
+ Serializer out = processor.newSerializer(output);
+ out.setOutputProperty(Serializer.Property.METHOD, "xml");
+ out.setOutputProperty(Serializer.Property.INDENT, "yes");
+
+ trans.setDestination(out);
+ trans.transform();
+ final String xml = output.toString();
+ value.setBody(xml);
+ value.setDateOfTransformation(dateOfTransformation);
+ aggregationCounter.getProcessedItems().add(1);
+ return value;
+ } catch (Throwable e) {
+ aggregationCounter.getErrorItems().add(1);
+ throw new RuntimeException(e);
+ }
+ }
+
+ public AggregationCounter getAggregationCounter() {
+ return aggregationCounter;
+ }
+
+ public String getTransformationRule() {
+ return transformationRule;
+ }
+
+ public Cleaner getCleanFunction() {
+ return cleanFunction;
+ }
+
+ public long getDateOfTransformation() {
+ return dateOfTransformation;
+ }
+
+ public VocabularyGroup getVocabularies() {
+ return vocabularies;
+ }
+}
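
Stylesheets reach the three registered extension functions through the namespaces derived from QNAME_BASE_URI (http://eu/dnetlib/transform/clean, /dateISO and /person). On the Spark side the class is a plain MapFunction over MetadataRecord, so a driver can apply it roughly as sketched below. This is an editorial sketch, not the actual transformation job: the parquet paths, the session handling and the way AggregationCounter and VocabularyGroup are obtained are assumptions, which is why they are passed in as parameters.

package eu.dnetlib.dhp.transformation.xslt;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;

// Editorial sketch, not part of the patch: applying XSLTTransformationFunction to an
// mdstore Dataset. Counter, vocabularies and paths are supplied by the caller so that
// no assumptions are made about how the real job builds them.
public class XsltTransformSketch {

	public static void transform(
		final SparkSession spark,
		final AggregationCounter counter,
		final VocabularyGroup vocabularies,
		final String xsltRule,
		final String inputPath,
		final String outputPath) throws Exception {

		final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);

		// Assumes the input mdstore is stored as parquet with a schema matching MetadataRecord.
		final Dataset<MetadataRecord> input = spark
			.read()
			.parquet(inputPath)
			.as(encoder);

		final Dataset<MetadataRecord> transformed = input
			.map(
				new XSLTTransformationFunction(counter, xsltRule, System.currentTimeMillis(), vocabularies),
				encoder);

		transformed
			.write()
			.mode(SaveMode.Overwrite)
			.parquet(outputPath);
	}
}
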
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java
new file mode 100644
index 0000000000..b013bbabd0
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java
@@ -0,0 +1,14 @@
+
+package eu.dnetlib.dhp.transformation.xslt.utils;
+
+// import org.apache.commons.text.WordUtils;
+import com.google.common.base.Function;
+
+public class Capitalize implements Function<String, String> {
+
+ @Override
+ public String apply(String s) {
+ return org.apache.commons.lang3.text.WordUtils.capitalize(s.toLowerCase());
+ }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java
new file mode 100644
index 0000000000..01174bf04c
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java
@@ -0,0 +1,12 @@
+
+package eu.dnetlib.dhp.transformation.xslt.utils;
+
+import com.google.common.base.Function;
+
+public class DotAbbreviations implements Function<String, String> {
+
+ @Override
+ public String apply(String s) {
+ return s.length() == 1 ? s + "." : s;
+ }
+}
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml
similarity index 62%
rename from dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/config-default.xml
rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml
index 7c1a43e513..dd3c32c620 100644
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/provision/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml
@@ -1,4 +1,12 @@
 <configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
     <property>
         <name>oozie.use.system.libpath</name>
         <value>true</value>
@@ -7,6 +15,7 @@
         <name>oozie.action.sharelib.for.spark</name>
         <value>spark2</value>
     </property>
+
     <property>
         <name>oozie.launcher.mapreduce.user.classpath.first</name>
         <value>true</value>
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml
new file mode 100644
index 0000000000..3c58ace7bf
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml
@@ -0,0 +1,46 @@
+
+
+
+ sourcePath
+ the working path of Datacite stores
+
+
+ outputPath
+ the path of Datacite ActionSet
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+ yarn-cluster
+ cluster
+ ExportDataset
+ eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.sql.shuffle.partitions=3840
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}
+ --targetPath${outputPath}
+ --masteryarn-cluster
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter
new file mode 100644
index 0000000000..ad80d69980
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter
@@ -0,0 +1,28 @@
+TUBYDI - Assistir Filmes e Series Online Grátis
+123Movies
+WATCH FULL MOVIE
+Movierulz
+Full Movie Online
+MOVIé WatcH
+The King of Staten Island 2020 Online For Free
+Watch Train to Busan 2 2020 online for free
+Sixth Sense Movie Novelization
+Film Complet streaming vf gratuit en ligne
+watch now free
+LIVE stream watch
+LIVE stream UFC
+RBC Heritage live stream
+MLBStreams Free
+NFL Live Stream
+Live Stream Free
+Royal Ascot 2020 Live Stream
+TV Shows Full Episodes Official
+FuboTV
+Gomovies
+Online Free Trial Access
+123watch
+DÜŞÜK HAPI
+Bebek Düşürme Yöntemleri
+WHATSAP İLETİŞİM
+Cytotec
+düşük hapı
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json
similarity index 52%
rename from dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json
rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json
index 6d90ced2cb..63e0803372 100644
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/export/input_maketar_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json
@@ -1,20 +1,21 @@
[
- {
- "paramName": "n",
- "paramLongName": "nameNode",
- "paramDescription": "the Name Node",
- "paramRequired": true
- },
{
"paramName": "s",
"paramLongName": "sourcePath",
- "paramDescription": "the source path",
+ "paramDescription": "the source mdstore path",
"paramRequired": true
},
+
{
"paramName": "t",
"paramLongName": "targetPath",
- "paramDescription": "the target path",
+ "paramDescription": "the target mdstore path",
+ "paramRequired": true
+ },
+ {
+ "paramName": "m",
+ "paramLongName": "master",
+ "paramDescription": "the master name",
"paramRequired": true
}
]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json
new file mode 100644
index 0000000000..63e0803372
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json
@@ -0,0 +1,21 @@
+[
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the source mdstore path",
+ "paramRequired": true
+ },
+
+ {
+ "paramName": "t",
+ "paramLongName": "targetPath",
+ "paramDescription": "the target mdstore path",
+ "paramRequired": true
+ },
+ {
+ "paramName": "m",
+ "paramLongName": "master",
+ "paramDescription": "the master name",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json
new file mode 100644
index 0000000000..67e7f37dcb
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json
@@ -0,0 +1,34 @@
+[
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the source mdstore path",
+ "paramRequired": true
+ },
+
+ {
+ "paramName": "t",
+ "paramLongName": "targetPath",
+ "paramDescription": "the target mdstore path",
+ "paramRequired": true
+ },
+ {
+ "paramName": "m",
+ "paramLongName": "master",
+ "paramDescription": "the master name",
+ "paramRequired": true
+ },
+ {
+ "paramName": "i",
+ "paramLongName": "isLookupUrl",
+ "paramDescription": "the isLookup URL",
+ "paramRequired": true
+ },
+ {
+ "paramName": "l",
+ "paramLongName": "exportLinks",
+ "paramDescription": "should export also links",
+ "paramRequired": false
+ }
+
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json
new file mode 100644
index 0000000000..ddc70bc536
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json
@@ -0,0 +1,1032 @@
+{
+ "SND.QOG": {
+ "openaire_id": "re3data_____::r3d100012231",
+ "datacite_name": "Quality of Government Institute",
+ "official_name": "Quality of Government Institute's Data",
+ "similarity": 0.8985507246376812
+ },
+ "GESIS.CESSDA": {
+ "openaire_id": "re3data_____::r3d100010202",
+ "datacite_name": "CESSDA ERIC",
+ "official_name": "CESSDA ERIC"
+ },
+ "BL.CRAN": {
+ "openaire_id": "re3data_____::r3d100012068",
+ "datacite_name": "Cranfield University",
+ "official_name": "Cranfield Online Research Data"
+ },
+ "SUL.OPENNEURO": {
+ "openaire_id": "re3data_____::r3d100010924",
+ "datacite_name": "OpenNeuro",
+ "official_name": "OpenNeuro"
+ },
+ "UNAVCO.UNAVCO": {
+ "openaire_id": "re3data_____::r3d100010872",
+ "datacite_name": "UNAVCO",
+ "official_name": "UNAVCO"
+ },
+ "SUL.SDR": {
+ "openaire_id": "re3data_____::r3d100010710",
+ "datacite_name": "Stanford Digital Repository",
+ "official_name": "Stanford Digital Repository"
+ },
+ "DK.ICES": {
+ "openaire_id": "re3data_____::r3d100011288",
+ "datacite_name": "International Council for the Exploration of the Sea (ICES)",
+ "official_name": "International Council for the Exploration of the Sea datasets",
+ "similarity": 0.8833333333333333
+ },
+ "CISTI.DFOSCIMR": {
+ "openaire_id": "re3data_____::r3d100012039",
+ "datacite_name": "Bedford Institute of Oceanography - Fisheries and Oceans Canada - Ocean Data and Information Section",
+ "official_name": "Bedford Institute of Oceanography - Oceanographic Databases"
+ },
+ "CSIC.DIGITAL": {
+ "openaire_id": "re3data_____::r3d100011076",
+ "datacite_name": "Digital CSIC",
+ "official_name": "DIGITAL.CSIC"
+ },
+ "TIB.PANGAEA": {
+ "openaire_id": "re3data_____::r3d100010134",
+ "datacite_name": "PANGAEA",
+ "official_name": "PANGAEA"
+ },
+ "PSU.DATACOM": {
+ "openaire_id": "re3data_____::r3d100010477",
+ "datacite_name": "Data Commons",
+ "official_name": "ANU Data Commons",
+ "similarity": 0.8571428571428571
+ },
+ "ANDS.CENTRE72": {
+ "openaire_id": "re3data_____::r3d100010451",
+ "datacite_name": "PARADISEC",
+ "official_name": "Pacific and Regional Archive for Digital Sources in Endangered Cultures"
+ },
+ "BL.OXDB": {
+ "openaire_id": "re3data_____::r3d100011653",
+ "datacite_name": "Oxford University Library Service Databank",
+ "official_name": "DataBank, Bodleian Libraries, University of Oxford"
+ },
+ "BL.STANDREW": {
+ "openaire_id": "re3data_____::r3d100012411",
+ "datacite_name": "University of St Andrews",
+ "official_name": "St Andrews Research portal - Research Data"
+ },
+ "TIB.BAFG": {
+ "openaire_id": "re3data_____::r3d100011664",
+ "datacite_name": "Bundesanstalt f\u00fcr Gew\u00e4sserkunde",
+ "official_name": "Geoportal der BFG"
+ },
+ "CRUI.UNIBO": {
+ "openaire_id": "re3data_____::r3d100012604",
+ "datacite_name": "Universit\u00e0 degli Studi di Bologna",
+ "official_name": "AMS Acta"
+ },
+ "GDCC.ODUM-LIBRARY": {
+ "openaire_id": "re3data_____::r3d100000005",
+ "datacite_name": "UNC Libraries",
+ "official_name": "UNC Dataverse"
+ },
+ "RG.RG": {
+ "openaire_id": "re3data_____::r3d100012227",
+ "datacite_name": "ResearchGate",
+ "official_name": "ResearchGate"
+ },
+ "TIB.EUMETSAT": {
+ "openaire_id": "re3data_____::r3d100010232",
+ "datacite_name": "EUMETSAT",
+ "official_name": "Eumetsat"
+ },
+ "SND.SMHI": {
+ "openaire_id": "re3data_____::r3d100011776",
+ "datacite_name": "Swedish Meteorological and Hydrological Institute open data",
+ "official_name": "Swedish Meteorological and Hydrological Institute open data"
+ },
+ "NOAA.NCEI": {
+ "openaire_id": "re3data_____::r3d100011801",
+ "datacite_name": "National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI)",
+ "official_name": "NCEI"
+ },
+ "TIB.WDCC": {
+ "openaire_id": "re3data_____::r3d100010299",
+ "datacite_name": "World Data Center for Climate",
+ "official_name": "World Data Center for Climate"
+ },
+ "CNGB.GIGADB": {
+ "openaire_id": "re3data_____::r3d100010478",
+ "datacite_name": "GigaDB",
+ "official_name": "GigaDB"
+ },
+ "DELFT.VLIZ": {
+ "openaire_id": "re3data_____::r3d100010661",
+ "datacite_name": "Vlaams Instituut voor de Zee",
+ "official_name": "Flanders Marine Institute"
+ },
+ "NUS.SB": {
+ "openaire_id": "re3data_____::r3d100012564",
+ "datacite_name": "National University of Singapore",
+ "official_name": "ScholarBank@NUS"
+ },
+ "EDI.EDI": {
+ "openaire_id": "re3data_____::r3d100010272",
+ "datacite_name": "Environmental Data Initiative",
+ "official_name": "Environmental Data Initiative Repository"
+ },
+ "INIST.ADISP": {
+ "openaire_id": "re3data_____::r3d100010494",
+ "datacite_name": "Quetelet PROGEDO Diffusion",
+ "official_name": "Quetelet PROGEDO Diffusion"
+ },
+ "GESIS.SHARE": {
+ "openaire_id": "re3data_____::r3d100010430",
+ "datacite_name": "SHARE - ERIC",
+ "official_name": "Survey of Health, Ageing and Retirement in Europe"
+ },
+ "ANDS.CENTRE-1": {
+ "openaire_id": "re3data_____::r3d100010864",
+ "datacite_name": "Griffith University",
+ "official_name": "Griffith University Research Data Repository"
+ },
+ "BL.READING": {
+ "openaire_id": "re3data_____::r3d100012064",
+ "datacite_name": "University of Reading",
+ "official_name": "University of Reading Research Data Archive"
+ },
+ "CORNELL.CISER": {
+ "openaire_id": "re3data_____::r3d100011056",
+ "datacite_name": "CISER Data Archive",
+ "official_name": "CISER Data Archive"
+ },
+ "DRYAD.DRYAD": {
+ "openaire_id": "re3data_____::r3d100000044",
+ "datacite_name": "DRYAD",
+ "official_name": "DRYAD"
+ },
+ "CDL.PISCO": {
+ "openaire_id": "re3data_____::r3d100010947",
+ "datacite_name": "Partnership for Interdisciplinary Studies of Coastal Oceans (PISCO)",
+ "official_name": "Partnership for Interdisciplinary Studies of Coastal Oceans"
+ },
+ "IEEE.DATAPORT": {
+ "openaire_id": "re3data_____::r3d100012569",
+ "datacite_name": "IEEE DataPort",
+ "official_name": "IEEE DataPort"
+ },
+ "DELFT.MAASTRO": {
+ "openaire_id": "re3data_____::r3d100011086",
+ "datacite_name": "MAASTRO Clinic",
+ "official_name": "CancerData.org"
+ },
+ "USGS.PROD": {
+ "openaire_id": "re3data_____::r3d100010054",
+ "datacite_name": "USGS DOI Tool Production Environment",
+ "official_name": "U.S. Geological Survey"
+ },
+ "GDCC.ODUM-DV": {
+ "openaire_id": "re3data_____::r3d100000005",
+ "datacite_name": "Odum Institute Dataverse",
+ "official_name": "UNC Dataverse"
+ },
+ "CDL.SDSCSG": {
+ "openaire_id": "re3data_____::r3d100011690",
+ "datacite_name": "UCSD Signaling Gateway",
+ "official_name": "UCSD Signaling gateway"
+ },
+ "ORBIS.NKN": {
+ "openaire_id": "re3data_____::r3d100011587",
+ "datacite_name": "Northwest Knowledge Network",
+ "official_name": "Northwest Knowledge Network"
+ },
+ "ANDS.CENTRE63": {
+ "openaire_id": "re3data_____::r3d100010918",
+ "datacite_name": "Test: Atlas of Living Australia",
+ "official_name": "Atlas of Living Australia",
+ "similarity": 0.8928571428571429
+ },
+ "SML.TALKBANK": {
+ "openaire_id": "re3data_____::r3d100010887",
+ "datacite_name": "TalkBank",
+ "official_name": "TalkBank"
+ },
+ "CORNELL.LIBRARY": {
+ "openaire_id": "re3data_____::r3d100012322",
+ "datacite_name": "Cornell University Library",
+ "official_name": "eCommons - Cornell's digital repository"
+ },
+ "BL.SOTON": {
+ "openaire_id": "re3data_____::r3d100011245",
+ "datacite_name": "University of Southampton",
+ "official_name": "University of Southampton Institutional Research Repository"
+ },
+ "GESIS.DB-BANK": {
+ "openaire_id": "re3data_____::r3d100012252",
+ "datacite_name": "Forschungsdaten- und Servicezentrum der Bundesbank",
+ "official_name": "Forschungsdaten- und Servicezentrum der Bundesbank"
+ },
+ "ANDS.CENTRE68": {
+ "openaire_id": "re3data_____::r3d100010918",
+ "datacite_name": "Atlas of Living Australia",
+ "official_name": "Atlas of Living Australia"
+ },
+ "ANDS.CENTRE69": {
+ "openaire_id": "re3data_____::r3d100010914",
+ "datacite_name": "Australian Ocean Data Network",
+ "official_name": "Australian Ocean Data Network Portal"
+ },
+ "INIST.CDS": {
+ "openaire_id": "re3data_____::r3d100010584",
+ "datacite_name": "Strasbourg Astronomical Data Center",
+ "official_name": "Strasbourg Astronomical Data Center"
+ },
+ "BL.NHM": {
+ "openaire_id": "re3data_____::r3d100011675",
+ "datacite_name": "Natural History Museum, London",
+ "official_name": "Natural History Museum, Data Portal"
+ },
+ "BL.ADS": {
+ "openaire_id": "re3data_____::r3d100000006",
+ "datacite_name": "Archaeology Data Service",
+ "official_name": "Archaeology Data Service"
+ },
+ "GDCC.JHU": {
+ "openaire_id": "re3data_____::r3d100011836",
+ "datacite_name": "Johns Hopkins University Library",
+ "official_name": "Johns Hopkins Data Archive Dataverse Network"
+ },
+ "BL.ED": {
+ "openaire_id": "re3data_____::r3d100000047",
+ "datacite_name": "University of Edinburgh",
+ "official_name": "Edinburgh DataShare"
+ },
+ "BL.EXETER": {
+ "openaire_id": "re3data_____::r3d100011202",
+ "datacite_name": "University of Exeter",
+ "official_name": "Open Research Exeter"
+ },
+ "BL.NCL": {
+ "openaire_id": "re3data_____::r3d100012408",
+ "datacite_name": "Newcastle University",
+ "official_name": "NCL Data"
+ },
+ "BROWN.BDR": {
+ "openaire_id": "re3data_____::r3d100011654",
+ "datacite_name": "Brown Digital Repository",
+ "official_name": "Brown Digital Repository"
+ },
+ "GDCC.SYR-QDR": {
+ "openaire_id": "re3data_____::r3d100011038",
+ "datacite_name": "Syracuse University Qualitative Data Repository",
+ "official_name": "Qualitative Data Repository"
+ },
+ "BL.BRISTOL": {
+ "openaire_id": "re3data_____::r3d100011099",
+ "datacite_name": "University of Bristol",
+ "official_name": "data.bris Research Data Repository"
+ },
+ "DATACITE.DATACITE": {
+ "openaire_id": "openaire____::datacite",
+ "datacite_name": "DataCite",
+ "official_name": "Datacite"
+ },
+ "ESTDOI.KEEL": {
+ "openaire_id": "re3data_____::r3d100011941",
+ "datacite_name": "Keeleressursid. The Center of Estonian Language Resources",
+ "official_name": "Center of Estonian Language Resources"
+ },
+ "BL.ESSEX": {
+ "openaire_id": "re3data_____::r3d100012405",
+ "datacite_name": "University of Essex",
+ "official_name": "Research Data at Essex"
+ },
+ "PURDUE.MDF": {
+ "openaire_id": "re3data_____::r3d100012080",
+ "datacite_name": "Univ Chicago Materials Data Facility",
+ "official_name": "Materials Data Facility"
+ },
+ "DELFT.KNMI": {
+ "openaire_id": "re3data_____::r3d100011879",
+ "datacite_name": "KNMI Data Centre",
+ "official_name": "KNMI Data Centre"
+ },
+ "CUL.CIESIN": {
+ "openaire_id": "re3data_____::r3d100010207",
+ "datacite_name": "Center for International Earth Science Information Network",
+ "official_name": "Center for International Earth Science Information Network"
+ },
+ "WISC.NEOTOMA": {
+ "openaire_id": "re3data_____::r3d100011761",
+ "datacite_name": "Neotoma Paleoecological Database",
+ "official_name": "Neotoma Paleoecology Database",
+ "similarity": 0.9180327868852459
+ },
+ "IRIS.IRIS": {
+ "openaire_id": "re3data_____::r3d100010268",
+ "datacite_name": "Incorporated Research Institutions for Seismology",
+ "official_name": "Incorporated Research Institutions for Seismology"
+ },
+ "ANDS.CENTRE50": {
+ "openaire_id": "re3data_____::r3d100012378",
+ "datacite_name": "Analysis and Policy Observatory",
+ "official_name": "Analysis and Policy Observatory"
+ },
+ "FAO.RING": {
+ "openaire_id": "re3data_____::r3d100012571",
+ "datacite_name": "CIARD RING",
+ "official_name": "CIARD Ring"
+ },
+ "CUL.R2R": {
+ "openaire_id": "re3data_____::r3d100010735",
+ "datacite_name": "Rolling Deck to Repository",
+ "official_name": "Rolling Deck to Repository"
+ },
+ "DEMO.GRIIDC": {
+ "openaire_id": "re3data_____::r3d100011571",
+ "datacite_name": "Gulf of Mexico Research Initiative Information and Data Cooperative",
+ "official_name": "Gulf of Mexico Research Initiative Information and Data Cooperative"
+ },
+ "ANDS.CENTRE-6": {
+ "openaire_id": "re3data_____::r3d100012268",
+ "datacite_name": "Curtin University",
+ "official_name": "Curtin University Research Data Collection"
+ },
+ "ANDS.CENTRE-5": {
+ "openaire_id": "re3data_____::r3d100012013",
+ "datacite_name": "TERN Central Portal",
+ "official_name": "TERN Data Discovery portal"
+ },
+ "FIGSHARE.UCT": {
+ "openaire_id": "re3data_____::r3d100012633",
+ "datacite_name": "University of Cape Town (UCT)",
+ "official_name": "ZivaHub"
+ },
+ "BIBSYS.UIT-ORD": {
+ "openaire_id": "re3data_____::r3d100012538",
+ "datacite_name": "DataverseNO",
+ "official_name": "DataverseNO"
+ },
+ "CISTI.CADC": {
+ "openaire_id": "re3data_____::r3d100000016",
+ "datacite_name": "Canadian Astronomy Data Centre",
+ "official_name": "The Canadian Astronomy Data Centre",
+ "similarity": 0.9375
+ },
+ "BL.CCDC": {
+ "openaire_id": "re3data_____::r3d100010197",
+ "datacite_name": "The Cambridge Crystallographic Data Centre",
+ "official_name": "The Cambridge Structural Database"
+ },
+ "BL.UCLD": {
+ "openaire_id": "re3data_____::r3d100012417",
+ "datacite_name": "University College London",
+ "official_name": "UCL Discovery"
+ },
+ "GESIS.RKI": {
+ "openaire_id": "re3data_____::r3d100010436",
+ "datacite_name": "'Health Monitoring' Research Data Centre at the Robert Koch Institute",
+ "official_name": "'Health Monitoring' Research Data Centre at the Robert Koch Institute"
+ },
+ "BL.DRI": {
+ "openaire_id": "re3data_____::r3d100011805",
+ "datacite_name": "Digital Repository of Ireland",
+ "official_name": "Digital Repository of Ireland"
+ },
+ "TIB.KIT-IMK": {
+ "openaire_id": "re3data_____::r3d100011956",
+ "datacite_name": "Institute for Meteorology and Climate Research - Atmospheric Trace Gases and Remote Sensing",
+ "official_name": "CARIBIC"
+ },
+ "DOINZ.LANDCARE": {
+ "openaire_id": "re3data_____::r3d100011662",
+ "datacite_name": "Landcare Research New Zealand Ltd",
+ "official_name": "Landcare Research Data Repository"
+ },
+ "DEMO.EMORY": {
+ "openaire_id": "re3data_____::r3d100011559",
+ "datacite_name": "The Cancer Imaging Archive",
+ "official_name": "The Cancer Imaging Archive"
+ },
+ "UMN.DRUM": {
+ "openaire_id": "re3data_____::r3d100011393",
+ "datacite_name": "Data Repository for the University of Minnesota",
+ "official_name": "Data Repository for the University of Minnesota"
+ },
+ "CISTI.SFU": {
+ "openaire_id": "re3data_____::r3d100012512",
+ "datacite_name": "Simon Fraser University",
+ "official_name": "SFU Radar"
+ },
+ "GESIS.ICPSR": {
+ "openaire_id": "re3data_____::r3d100010255",
+ "datacite_name": "ICPSR",
+ "official_name": "Inter-university Consortium for Political and Social Research"
+ },
+ "ANDS.CENTRE49": {
+ "openaire_id": "re3data_____::r3d100012145",
+ "datacite_name": "The University of Melbourne",
+ "official_name": "melbourne.figshare.com"
+ },
+ "ZBW.IFO": {
+ "openaire_id": "re3data_____::r3d100010201",
+ "datacite_name": "LMU-ifo Economics & Business Data Center",
+ "official_name": "LMU-ifo Economics & Business Data Center"
+ },
+ "TIB.BEILST": {
+ "openaire_id": "re3data_____::r3d100012329",
+ "datacite_name": "Beilstein-Institut zur F\u00f6rderung der Chemischen Wissenschaften",
+ "official_name": "STRENDA DB"
+ },
+ "ZBW.ZBW-JDA": {
+ "openaire_id": "re3data_____::r3d100012190",
+ "datacite_name": "ZBW Journal Data Archive",
+ "official_name": "ZBW Journal Data Archive"
+ },
+ "BL.UKDA": {
+ "openaire_id": "re3data_____::r3d100010215",
+ "datacite_name": "UK Data Archive",
+ "official_name": "UK Data Archive"
+ },
+ "CERN.INSPIRE": {
+ "openaire_id": "re3data_____::r3d100011077",
+ "datacite_name": "inspirehep.net",
+ "official_name": "Inspire-HEP"
+ },
+ "CISTI.OTNDC": {
+ "openaire_id": "re3data_____::r3d100012083",
+ "datacite_name": "Ocean Tracking Network",
+ "official_name": "Ocean Tracking Network"
+ },
+ "CISTI.CC": {
+ "openaire_id": "re3data_____::r3d100012646",
+ "datacite_name": "Compute Canada",
+ "official_name": "Federated Research Data Repository"
+ },
+ "SND.ICOS": {
+ "openaire_id": "re3data_____::r3d100012203",
+ "datacite_name": "ICOS Carbon Portal",
+ "official_name": "ICOS Carbon Portal"
+ },
+ "BL.MENDELEY": {
+ "openaire_id": "re3data_____::r3d100011868",
+ "datacite_name": "Mendeley",
+ "official_name": "Mendeley Data"
+ },
+ "DELFT.UU": {
+ "openaire_id": "re3data_____::r3d100011201",
+ "datacite_name": "Universiteit Utrecht",
+ "official_name": "DataverseNL"
+ },
+ "GESIS.DSZ-BO": {
+ "openaire_id": "re3data_____::r3d100010439",
+ "datacite_name": "Data Service Center for Business and Organizational Data",
+ "official_name": "Data Service Center for Business and Organizational Data"
+ },
+ "TIB.IPK": {
+ "openaire_id": "re3data_____::r3d100011647",
+ "datacite_name": "IPK Gatersleben",
+ "official_name": "IPK Gatersleben"
+ },
+ "GDCC.HARVARD-DV": {
+ "openaire_id": "re3data_____::r3d100010051",
+ "datacite_name": "Harvard IQSS Dataverse",
+ "official_name": "Harvard Dataverse"
+ },
+ "BL.LEEDS": {
+ "openaire_id": "re3data_____::r3d100011945",
+ "datacite_name": "University of Leeds",
+ "official_name": "Research Data Leeds Repository"
+ },
+ "BL.BRUNEL": {
+ "openaire_id": "re3data_____::r3d100012140",
+ "datacite_name": "Brunel University London",
+ "official_name": "Brunel figshare"
+ },
+ "DEMO.ENVIDAT": {
+ "openaire_id": "re3data_____::r3d100012587",
+ "datacite_name": "EnviDat",
+ "official_name": "EnviDat"
+ },
+ "GDCC.NTU": {
+ "openaire_id": "re3data_____::r3d100012440",
+ "datacite_name": "Nanyang Technological University",
+ "official_name": "DR-NTU (Data)"
+ },
+ "UNM.DATAONE": {
+ "openaire_id": "re3data_____::r3d100000045",
+ "datacite_name": "DataONE",
+ "official_name": "DataONE"
+ },
+ "CSC.NRD": {
+ "openaire_id": "re3data_____::r3d100012157",
+ "datacite_name": "Ministry of Culture and Education",
+ "official_name": "IDA Research Data Storage Service"
+ },
+ "GESIS.DIPF": {
+ "openaire_id": "re3data_____::r3d100010390",
+ "datacite_name": "Research Data Centre for Education",
+ "official_name": "Research Data Centre for Education"
+ },
+ "BL.HALLAM": {
+ "openaire_id": "re3data_____::r3d100011909",
+ "datacite_name": "Sheffield Hallam University",
+ "official_name": "Sheffield Hallam University Research Data Archive"
+ },
+ "BL.LSHTM": {
+ "openaire_id": "re3data_____::r3d100011800",
+ "datacite_name": "London School of Hygiene and Tropical Medicine",
+ "official_name": "LSHTM Data Compass"
+ },
+ "SUBGOE.DARIAH": {
+ "openaire_id": "re3data_____::r3d100011345",
+ "datacite_name": "Digital Research Infrastructure for the Arts and Humanities",
+ "official_name": "DARIAH-DE Repository"
+ },
+ "SND.SU": {
+ "openaire_id": "re3data_____::r3d100012147",
+ "datacite_name": "Stockholm University",
+ "official_name": "Stockholm University repository for data"
+ },
+ "GESIS.INDEPTH": {
+ "openaire_id": "re3data_____::r3d100011392",
+ "datacite_name": "INDEPTH Network",
+ "official_name": "INDEPTH Data Repository"
+ },
+ "TIB.FLOSS": {
+ "openaire_id": "re3data_____::r3d100010863",
+ "datacite_name": "FLOSS Project, Syracuse University",
+ "official_name": "FLOSSmole"
+ },
+ "ETHZ.WGMS": {
+ "openaire_id": "re3data_____::r3d100010627",
+ "datacite_name": "World Glacier Monitoring Service",
+ "official_name": "World Glacier Monitoring Service"
+ },
+ "BL.UEL": {
+ "openaire_id": "re3data_____::r3d100012414",
+ "datacite_name": "University of East London",
+ "official_name": "Data.uel"
+ },
+ "DELFT.DATA4TU": {
+ "openaire_id": "re3data_____::r3d100010216",
+ "datacite_name": "4TU.Centre for Research Data",
+ "official_name": "4TU.Centre for Research Data"
+ },
+ "GESIS.IANUS": {
+ "openaire_id": "re3data_____::r3d100012361",
+ "datacite_name": "IANUS - FDZ Arch\u00e4ologie & Altertumswissenschaften",
+ "official_name": "IANUS Datenportal"
+ },
+ "CDL.UCSDCCA": {
+ "openaire_id": "re3data_____::r3d100011655",
+ "datacite_name": "California Coastal Atlas",
+ "official_name": "California Coastal Atlas"
+ },
+ "VIVA.VT": {
+ "openaire_id": "re3data_____::r3d100012601",
+ "datacite_name": "Virginia Tech",
+ "official_name": "VTechData"
+ },
+ "ANDS.CENTRE39": {
+ "openaire_id": "re3data_____::r3d100011640",
+ "datacite_name": "University of the Sunshine Coast",
+ "official_name": "USC Research Bank research data"
+ },
+ "DEMO.OPENKIM": {
+ "openaire_id": "re3data_____::r3d100011864",
+ "datacite_name": "OpenKIM",
+ "official_name": "OpenKIM"
+ },
+ "INIST.OTELO": {
+ "openaire_id": "re3data_____::r3d100012505",
+ "datacite_name": "Observatoire Terre Environnement de Lorraine",
+ "official_name": "ORDaR"
+ },
+ "INIST.ILL": {
+ "openaire_id": "re3data_____::r3d100012072",
+ "datacite_name": "Institut Laue-Langevin",
+ "official_name": "ILL Data Portal"
+ },
+ "ANDS.CENTRE31": {
+ "openaire_id": "re3data_____::r3d100012378",
+ "datacite_name": "Test: Analysis and Policy Observatory",
+ "official_name": "Analysis and Policy Observatory",
+ "similarity": 0.9117647058823529
+ },
+ "ANDS.CENTRE30": {
+ "openaire_id": "re3data_____::r3d100010917",
+ "datacite_name": "Test: Geoscience Australia",
+ "official_name": "Geoscience Australia",
+ "similarity": 0.8695652173913043
+ },
+ "BL.SALFORD": {
+ "openaire_id": "re3data_____::r3d100012144",
+ "datacite_name": "University of Salford",
+ "official_name": "University of Salford Data Repository"
+ },
+ "CERN.HEPDATA": {
+ "openaire_id": "re3data_____::r3d100010081",
+ "datacite_name": "HEPData.net",
+ "official_name": "HEPData"
+ },
+ "ETHZ.E-COLL": {
+ "openaire_id": "re3data_____::r3d100012557",
+ "datacite_name": "ETH Z\u00fcrich Research Collection",
+ "official_name": "ETH Z\u00fcrich Research Collection"
+ },
+ "GBIF.GBIF": {
+ "openaire_id": "re3data_____::r3d100000039",
+ "datacite_name": "Global Biodiversity Information Facility",
+ "official_name": "Global Biodiversity Information Facility"
+ },
+ "ORNLDAAC.DAAC": {
+ "openaire_id": "re3data_____::r3d100000037",
+ "datacite_name": "Oak Ridge National Laboratory Distributed Active Archive Center",
+ "official_name": "Oak Ridge National Laboratory Distributed Active Archive Center for Biogeochemical Dynamics"
+ },
+ "KAUST.KAUSTREPO": {
+ "openaire_id": "re3data_____::r3d100011898",
+ "datacite_name": "KAUST Research Repository",
+ "official_name": "UWA Research Repository",
+ "similarity": 0.875
+ },
+ "ZBW.ZEW": {
+ "openaire_id": "re3data_____::r3d100010399",
+ "datacite_name": "Zentrum f\u00fcr Europ\u00e4ische Wirtschaftsforschung GmbH (ZEW)",
+ "official_name": "ZEW Forschungsdatenzentrum"
+ },
+ "SML.TDAR": {
+ "openaire_id": "re3data_____::r3d100010347",
+ "datacite_name": "Digital Antiquity (TDAR)",
+ "official_name": "tDAR"
+ },
+ "GESIS.CSDA": {
+ "openaire_id": "re3data_____::r3d100010484",
+ "datacite_name": "Czech Social Science Data Archive",
+ "official_name": "Czech Social Science Data Archive"
+ },
+ "SND.BOLIN": {
+ "openaire_id": "re3data_____::r3d100011699",
+ "datacite_name": "Bolin Centre Database",
+ "official_name": "Bolin Centre Database"
+ },
+ "MLA.HC": {
+ "openaire_id": "re3data_____::r3d100012309",
+ "datacite_name": "Humanities Commons",
+ "official_name": "Humanities Commons"
+ },
+ "CDL.IDASHREP": {
+ "openaire_id": "re3data_____::r3d100010382",
+ "datacite_name": "iDASH Repository",
+ "official_name": "IDS Repository",
+ "similarity": 0.8666666666666667
+ },
+ "ZBMED.SNSB": {
+ "openaire_id": "re3data_____::r3d100011873",
+ "datacite_name": "Staatliche Naturwissenschaftliche Sammlungen Bayerns",
+ "official_name": "Staatliche Naturwissenschaftliche Sammlungen Bayerns - datasets",
+ "similarity": 0.9043478260869565
+ },
+ "ORBIS.OHSU": {
+ "openaire_id": "re3data_____::r3d100012244",
+ "datacite_name": "Oregon Health Sciences University",
+ "official_name": "OHSU Digital Commons"
+ },
+ "DARTLIB.CRAWDAD": {
+ "openaire_id": "re3data_____::r3d100010716",
+ "datacite_name": "CRAWDAD",
+ "official_name": "CRAWDAD"
+ },
+ "CDL.CCHDO": {
+ "openaire_id": "re3data_____::r3d100010831",
+ "datacite_name": "CLIVAR and Carbon Hydrographic Data Office",
+ "official_name": "Climate Variability and Predictability and Carbon Hydrographic Data Office"
+ },
+ "GESIS.AUSSDA": {
+ "openaire_id": "re3data_____::r3d100010483",
+ "datacite_name": "Austrian Social Science Data Archive",
+ "official_name": "AUSSDA"
+ },
+ "NSIDC.DATACTR": {
+ "openaire_id": "re3data_____::r3d100010110",
+ "datacite_name": "National Snow and Ice Data Center",
+ "official_name": "National Snow and Ice Data Center"
+ },
+ "TIB.RADAR": {
+ "openaire_id": "re3data_____::r3d100012330",
+ "datacite_name": "FIZ Karlsruhe \u2013 Leibniz-Institut f\u00fcr Informationsinfrastruktur",
+ "official_name": "RADAR"
+ },
+ "KIM.OPENKIM": {
+ "openaire_id": "re3data_____::r3d100011864",
+ "datacite_name": "Open Knowledgebase of Interatomic Models (OpenKIM)",
+ "official_name": "OpenKIM"
+ },
+ "BL.LBORO": {
+ "openaire_id": "re3data_____::r3d100012143",
+ "datacite_name": "Loughborough University",
+ "official_name": "Loughborough Data Repository"
+ },
+ "GESIS.ZPID": {
+ "openaire_id": "re3data_____::r3d100010328",
+ "datacite_name": "GESIS.ZPID",
+ "official_name": "PsychData"
+ },
+ "SML.TCIA": {
+ "openaire_id": "re3data_____::r3d100011559",
+ "datacite_name": "The Cancer Imaging Archive",
+ "official_name": "The Cancer Imaging Archive"
+ },
+ "CDL.IRIS": {
+ "openaire_id": "re3data_____::r3d100010268",
+ "datacite_name": "Incorporated Research Institutions for Seismology",
+ "official_name": "Incorporated Research Institutions for Seismology"
+ },
+ "BIBSYS.NMDC": {
+ "openaire_id": "re3data_____::r3d100012291",
+ "datacite_name": "Norwegian Marine Data Centre",
+ "official_name": "Norwegian Polar Data Centre",
+ "similarity": 0.8727272727272727
+ },
+ "ANDS.CENTRE25": {
+ "openaire_id": "re3data_____::r3d100010917",
+ "datacite_name": "Geoscience Australia",
+ "official_name": "Geoscience Australia"
+ },
+ "BL.UCLAN": {
+ "openaire_id": "re3data_____::r3d100012019",
+ "datacite_name": "University of Central Lancashire",
+ "official_name": "UCLanData"
+ },
+ "ANDS.CENTRE23": {
+ "openaire_id": "re3data_____::r3d100011898",
+ "datacite_name": "The University of Western Australia",
+ "official_name": "UWA Research Repository"
+ },
+ "CISTI.WOUDC": {
+ "openaire_id": "re3data_____::r3d100010367",
+ "datacite_name": "World Ozone and Ultraviolet Radiation Data Centre",
+ "official_name": "World Ozone and Ultraviolet Radiation Data Centre"
+ },
+ "FIGSHARE.ARS": {
+ "openaire_id": "re3data_____::r3d100010066",
+ "datacite_name": "figshare Academic Research System",
+ "official_name": "figshare"
+ },
+ "ILLINOIS.DATABANK": {
+ "openaire_id": "re3data_____::r3d100012001",
+ "datacite_name": "Illinois Data Bank",
+ "official_name": "Illinois Data Bank"
+ },
+ "BL.ECMWF": {
+ "openaire_id": "re3data_____::r3d100011726",
+ "datacite_name": "European Centre for Medium-Range Weather Forecasts",
+ "official_name": "European Centre for Medium-Range Weather Forecasts"
+ },
+ "CDL.ISSDA": {
+ "openaire_id": "re3data_____::r3d100010497",
+ "datacite_name": "Irish Social Science Data Archive (ISSDA)",
+ "official_name": "Irish Social Science Data Archive"
+ },
+ "CDL.PQR": {
+ "openaire_id": "re3data_____::r3d100012225",
+ "datacite_name": "Pitt Quantum Repository",
+ "official_name": "Pitt Quantum Repository"
+ },
+ "ANDS.CENTRE82": {
+ "openaire_id": "re3data_____::r3d100010138",
+ "datacite_name": "Test: Australian Data Archive",
+ "official_name": "Australian Data Archive",
+ "similarity": 0.8846153846153846
+ },
+ "GDCC.HARVARD-SLP": {
+ "openaire_id": "re3data_____::r3d100011861",
+ "datacite_name": "National Sleep Research Resource",
+ "official_name": "National Sleep Research Resource"
+ },
+ "CDL.IMMPORT": {
+ "openaire_id": "re3data_____::r3d100012529",
+ "datacite_name": "UCSF ImmPort",
+ "official_name": "ImmPort"
+ },
+ "GESIS.FID": {
+ "openaire_id": "re3data_____::r3d100012347",
+ "datacite_name": "FID f\u00fcr internationale und interdisziplin\u00e4re Rechtsforschung",
+ "official_name": "\u00b2Dok[\u00a7]"
+ },
+ "OCEAN.OCEAN": {
+ "openaire_id": "re3data_____::r3d100012369",
+ "datacite_name": "Code Ocean",
+ "official_name": "Code Ocean"
+ },
+ "CERN.ZENODO": {
+ "openaire_id": "re3data_____::r3d100010468",
+ "datacite_name": "Zenodo",
+ "official_name": "Zenodo"
+ },
+ "ETHZ.DA-RD": {
+ "openaire_id": "re3data_____::r3d100011626",
+ "datacite_name": "ETHZ Data Archive - Research Data",
+ "official_name": "ETH Data Archive"
+ },
+ "SND.ECDS": {
+ "openaire_id": "re3data_____::r3d100011000",
+ "datacite_name": "Environment Climate Data Sweden",
+ "official_name": "Environment Climate Data Sweden"
+ },
+ "BL.BATH": {
+ "openaire_id": "re3data_____::r3d100011947",
+ "datacite_name": "University of Bath",
+ "official_name": "University of Bath Research Data Archive"
+ },
+ "TIB.LDEO": {
+ "openaire_id": "re3data_____::r3d100012547",
+ "datacite_name": "LDEO - Lamont-Doherty Earth Observatory, Columbia University",
+ "official_name": "Lamont-Doherty Core Repository"
+ },
+ "COS.OSF": {
+ "openaire_id": "re3data_____::r3d100011137",
+ "datacite_name": "Open Science Framework",
+ "official_name": "Open Science Framework"
+ },
+ "ESTDOI.REPO": {
+ "openaire_id": "re3data_____::r3d100012333",
+ "datacite_name": "DataDOI",
+ "official_name": "DataDOI"
+ },
+ "CDL.NSFADC": {
+ "openaire_id": "re3data_____::r3d100011973",
+ "datacite_name": "NSF Arctic Data Center",
+ "official_name": "NSF Arctic Data Center"
+ },
+ "ANDS.CENTRE13": {
+ "openaire_id": "re3data_____::r3d100010477",
+ "datacite_name": "The Australian National University",
+ "official_name": "ANU Data Commons"
+ },
+ "BL.NERC": {
+ "openaire_id": "re3data_____::r3d100010199",
+ "datacite_name": "Natural Environment Research Council",
+ "official_name": "Environmental Information Data Centre"
+ },
+ "SAGEBIO.SYNAPSE": {
+ "openaire_id": "re3data_____::r3d100011894",
+ "datacite_name": "Synapse",
+ "official_name": "Synapse"
+ },
+ "ANDS.CENTRE15": {
+ "openaire_id": "re3data_____::r3d100000038",
+ "datacite_name": "Australian Antarctic Division",
+ "official_name": "Australian Antarctic Data Centre"
+ },
+ "WISC.BMRB": {
+ "openaire_id": "re3data_____::r3d100010191",
+ "datacite_name": "Biological Magnetic Resonance Bank",
+ "official_name": "Biological Magnetic Resonance Data Bank",
+ "similarity": 0.9315068493150684
+ },
+ "STSCI.MAST": {
+ "openaire_id": "re3data_____::r3d100010403",
+ "datacite_name": "Barbara A. Mikulski Archive for Space Telescopes",
+ "official_name": "Barbara A. Mikulski Archive for Space Telescopes"
+ },
+ "CDL.NSIDC": {
+ "openaire_id": "re3data_____::r3d100010110",
+ "datacite_name": "National Snow and Ice Data Center",
+ "official_name": "National Snow and Ice Data Center"
+ },
+ "BL.STRATH": {
+ "openaire_id": "re3data_____::r3d100012412",
+ "datacite_name": "University of Strathclyde",
+ "official_name": "University of Strathclyde KnowledgeBase Datasets"
+ },
+ "DEMO.TDAR": {
+ "openaire_id": "re3data_____::r3d100010347",
+ "datacite_name": "The Digital Archaeological Record (tDAR)",
+ "official_name": "tDAR"
+ },
+ "TIND.CALTECH": {
+ "openaire_id": "re3data_____::r3d100012384",
+ "datacite_name": "CaltechDATA",
+ "official_name": "CaltechDATA"
+ },
+ "GESIS.BIBB-FDZ": {
+ "openaire_id": "re3data_____::r3d100010190",
+ "datacite_name": "Forschungsdatenzentrum im Bundesinstitut f\u00fcr Berufsbildung",
+ "official_name": "Forschungsdatenzentrum im Bundesinstitut f\u00fcr Berufsbildung"
+ },
+ "ANDS.CENTRE87": {
+ "openaire_id": "re3data_____::r3d100010138",
+ "datacite_name": "Australian Data Archive",
+ "official_name": "Australian Data Archive"
+ },
+ "GESIS.NEPS": {
+ "openaire_id": "re3data_____::r3d100010736",
+ "datacite_name": "Nationales Bildungspanel (National Educational Panel Study, NEPS)",
+ "official_name": "Nationales Bildungspanel"
+ },
+ "CDL.UCBCRCNS": {
+ "openaire_id": "re3data_____::r3d100011269",
+ "datacite_name": "Collaborative Research in Computational Neuroscience (CRCNS)",
+ "official_name": "Collaborative Research in Computational Neuroscience"
+ },
+ "TIB.UKON": {
+ "openaire_id": "re3data_____::r3d100010469",
+ "datacite_name": "Movebank",
+ "official_name": "Movebank"
+ },
+ "UMN.IPUMS": {
+ "openaire_id": "re3data_____::r3d100010794",
+ "datacite_name": "Minnesota Population Center",
+ "official_name": "Minnesota Population Center"
+ },
+ "TIB.BIKF": {
+ "openaire_id": "re3data_____::r3d100012379",
+ "datacite_name": "Senckenberg Data & Metadata Repository",
+ "official_name": "Senckenberg Data & Metadata Repository"
+ },
+ "TDL.GRIIDC": {
+ "openaire_id": "re3data_____::r3d100011571",
+ "datacite_name": "Gulf of Mexico Research Initiative Information and Data Cooperative",
+ "official_name": "Gulf of Mexico Research Initiative Information and Data Cooperative"
+ },
+ "DELFT.NIBG": {
+ "openaire_id": "re3data_____::r3d100012167",
+ "datacite_name": "Sound and Vision",
+ "official_name": "Sound and Vision"
+ },
+ "BL.SURREY": {
+ "openaire_id": "re3data_____::r3d100012232",
+ "datacite_name": "University of Surrey",
+ "official_name": "Surrey Research Insight"
+ },
+ "OSTI.ORNLNGEE": {
+ "openaire_id": "re3data_____::r3d100011676",
+ "datacite_name": "NGEE-Arctic (Next Generation Ecosystems Experiement)",
+ "official_name": "NGEE Arctic"
+ },
+ "TIB.WDCRSAT": {
+ "openaire_id": "re3data_____::r3d100010156",
+ "datacite_name": "World Data Center for Remote Sensing of the Atmosphere",
+ "official_name": "The World Data Center for Remote Sensing of the Atmosphere",
+ "similarity": 0.9642857142857143
+ },
+ "ZBMED.DSMZ": {
+ "openaire_id": "re3data_____::r3d100010219",
+ "datacite_name": "DSMZ",
+ "official_name": "DSMZ"
+ },
+ "DOINZ.NZAU": {
+ "openaire_id": "re3data_____::r3d100012110",
+ "datacite_name": "University of Auckland Data Publishing and Discovery Service",
+ "official_name": "University of Auckland Data Repository"
+ },
+ "INIST.RESIF": {
+ "openaire_id": "re3data_____::r3d100012222",
+ "datacite_name": "R\u00e9seau sismologique et g\u00e9od\u00e9sique fran\u00e7ais",
+ "official_name": "RESIF Seismic Data Portal"
+ },
+ "CDL.NCEAS": {
+ "openaire_id": "re3data_____::r3d100010093",
+ "datacite_name": "National Center for Ecological Analysis and Synthesis (NCEAS)",
+ "official_name": "National Center for Ecological Analysis and Synthesis Data Repository"
+ },
+ "ZBMED.EMP": {
+ "openaire_id": "re3data_____::r3d100010234",
+ "datacite_name": "eyeMoviePedia",
+ "official_name": "eyeMoviePedia"
+ },
+ "ZBMED.BIOFRESH": {
+ "openaire_id": "re3data_____::r3d100011651",
+ "datacite_name": "Project BioFresh, Leibniz-Institute of Freshwater Ecology and Inland Fisheries",
+ "official_name": "Freshwater Biodiversity Data Portal"
+ },
+ "INIST.IFREMER": {
+ "openaire_id": "re3data_____::r3d100011867",
+ "datacite_name": "Institut Fran\u00e7ais de Recherche pour l'Exploitation de la Mer",
+ "official_name": "SEANOE"
+ },
+ "ETHZ.SICAS": {
+ "openaire_id": "re3data_____::r3d100011560",
+ "datacite_name": "SICAS",
+ "official_name": "Sicas Medical Image Repository"
+ },
+ "SND.SND": {
+ "openaire_id": "re3data_____::r3d100010146",
+ "datacite_name": "Swedish National Data Service",
+ "official_name": "Swedish National Data Service"
+ },
+ "DELFT.EASY": {
+ "openaire_id": "re3data_____::r3d100011201",
+ "datacite_name": "DANS",
+ "official_name": "DataverseNL"
+ },
+ "WH.WHOAS": {
+ "openaire_id": "re3data_____::r3d100010423",
+ "datacite_name": "Woods Hole Open Access Server",
+ "official_name": "Woods Hole Open Access Server"
+ },
+ "DATACITE.UCSC": {
+ "openaire_id": "re3data_____::r3d100010243",
+ "datacite_name": "UCSC Genome Browser",
+ "official_name": "UCSC Genome Browser"
+ }
+}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json
new file mode 100644
index 0000000000..a37ae4bba0
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json
@@ -0,0 +1,39 @@
+[
+ {
+ "paramName": "t",
+ "paramLongName": "targetPath",
+ "paramDescription": "the path of the sequencial file to write",
+ "paramRequired": true
+ },
+
+ {
+ "paramName": "d",
+ "paramLongName": "dataciteDumpPath",
+ "paramDescription": "the path of the Datacite dump",
+ "paramRequired": true
+ },
+ {
+ "paramName": "s",
+ "paramLongName": "skipImport",
+ "paramDescription": "avoid to downlaod new items but apply the previous update",
+ "paramRequired": false
+ },
+ {
+ "paramName": "bs",
+ "paramLongName": "blocksize",
+ "paramDescription": "define the requests block size",
+ "paramRequired": false
+ },
+ {
+ "paramName": "n",
+ "paramLongName": "namenode",
+ "paramDescription": "the hive metastore uris",
+ "paramRequired": true
+ },
+ {
+ "paramName": "m",
+ "paramLongName": "master",
+ "paramDescription": "the master name",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
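The descriptor above defines the command-line contract of the Datacite import job. A hedged sketch of how such a descriptor is typically consumed follows; it assumes the ArgumentApplicationParser helper from dhp-common (constructor taking the JSON descriptor, parseArgument, get), whose exact signatures may differ.

    import java.nio.charset.StandardCharsets;
    import java.util.Optional;

    import org.apache.commons.io.IOUtils;

    import eu.dnetlib.dhp.application.ArgumentApplicationParser;

    public class ImportDataciteArgsExample {

        public static void main(final String[] args) throws Exception {
            // load the descriptor shown above from the classpath and let the parser validate the args
            final ArgumentApplicationParser parser = new ArgumentApplicationParser(
                IOUtils
                    .toString(
                        ImportDataciteArgsExample.class
                            .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"),
                        StandardCharsets.UTF_8));
            parser.parseArgument(args);

            final String targetPath = parser.get("targetPath"); // required
            final boolean skipImport = Boolean.parseBoolean(parser.get("skipImport")); // optional, defaults to false
            final int blockSize = Optional
                .ofNullable(parser.get("blocksize"))
                .map(Integer::parseInt)
                .orElse(100);

            System.out.println("targetPath=" + targetPath + ", skipImport=" + skipImport + ", blockSize=" + blockSize);
        }
    }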
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml
new file mode 100644
index 0000000000..dd3c32c620
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml
@@ -0,0 +1,23 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml
new file mode 100644
index 0000000000..69fc959578
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml
@@ -0,0 +1,95 @@
+
+
+
+ mainPath
+ the working path of Datacite stores
+
+
+ oafTargetPath
+ the target path where the OAF records are stored
+
+
+ isLookupUrl
+ The IS lookUp service endpoint
+
+
+ blocksize
+ 100
+ The request block size
+
+
+ exportLinks
+ false
+ instructs the transformation phase whether or not to produce the links
+
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+ ${wf:conf('resumeFrom') eq 'TransformDatacite'}
+
+
+
+
+
+
+ yarn-cluster
+ cluster
+ ImportDatacite
+ eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --targetPath${mainPath}/datacite_update
+ --dataciteDumpPath${mainPath}/datacite_dump
+ --namenode${nameNode}
+ --masteryarn-cluster
+ --blocksize${blocksize}
+
+
+
+
+
+
+
+ yarn-cluster
+ cluster
+ TransformJob
+ eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.sql.shuffle.partitions=3840
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${mainPath}/datacite_dump
+ --targetPath${oafTargetPath}
+ --isLookupUrl${isLookupUrl}
+ --exportLinks${exportLinks}
+ --masteryarn-cluster
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/config-default.xml
new file mode 100644
index 0000000000..dd3c32c620
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/config-default.xml
@@ -0,0 +1,23 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/workflow.xml
new file mode 100644
index 0000000000..397288c694
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/workflow.xml
@@ -0,0 +1,84 @@
+
+
+
+ datacitePath
+ the path of Datacite spark dataset
+
+
+ isLookupUrl
+ The IS lookUp service endpoint
+
+
+ crossrefPath
+ the path of Crossref spark dataset
+
+
+
+ targetPath
+ the target path where the generated OAF datasets are stored
+
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+ yarn-cluster
+ cluster
+ ImportDatacite
+ eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.sql.shuffle.partitions=3840
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${datacitePath}
+ --targetPath${targetPath}/datacite_oaf
+ --isLookupUrl${isLookupUrl}
+ --exportLinkstrue
+ --masteryarn-cluster
+
+
+
+
+
+
+
+
+ yarn-cluster
+ cluster
+ FilterCrossrefEntities
+ eu.dnetlib.dhp.actionmanager.datacite.FilterCrossrefEntitiesSpark
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.sql.shuffle.partitions=3840
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${crossrefPath}
+ --targetPath${targetPath}/crossref_oaf
+ --masteryarn-cluster
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml
index 8ce5818851..e4f2715fb3 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml
@@ -1,4 +1,4 @@
-
+
projectFileURL
@@ -18,6 +18,10 @@
outputPath
path where to store the action set
+
+ sheetName
+ the name of the sheet to read
+
@@ -31,10 +35,23 @@
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV
@@ -43,7 +60,7 @@
--hdfsPath${workingDir}/projects
--classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProject
-
+
@@ -55,7 +72,7 @@
--hdfsPath${workingDir}/programme
--classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme
-
+
@@ -68,7 +85,7 @@
--sheetName${sheetName}
--classForNameeu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic
-
+
@@ -81,7 +98,7 @@
--postgresUser${postgresUser}
--postgresPassword${postgresPassword}
-
+
@@ -105,10 +122,15 @@
--programmePath${workingDir}/programme
--outputPath${workingDir}/preparedProgramme
-
+
+
+
+
+
+
yarn
@@ -130,7 +152,7 @@
--outputPath${workingDir}/preparedProjects
--dbProjectPath${workingDir}/dbProjects
-
+
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json
index 39243224f4..765e9c0af7 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json
@@ -11,4 +11,4 @@
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
-]
\ No newline at end of file
+]
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/workflow.xml
index 3df5f55a5a..3d00b80a8c 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/workflow.xml
@@ -11,17 +11,17 @@
-
+
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
-
+
-
-
-
-
+
+
+
+
@@ -52,4 +52,4 @@
-
\ No newline at end of file
+
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json
deleted file mode 100644
index 4a6aec5ee1..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json
+++ /dev/null
@@ -1,86 +0,0 @@
-[
- {
- "paramName": "issm",
- "paramLongName": "isSparkSessionManaged",
- "paramDescription": "when true will stop SparkSession after job execution",
- "paramRequired": false
- },
- {
- "paramName": "e",
- "paramLongName": "encoding",
- "paramDescription": "the encoding of the input record should be JSON or XML",
- "paramRequired": true
- },
- {
- "paramName": "d",
- "paramLongName": "dateOfCollection",
- "paramDescription": "the date when the record has been stored",
- "paramRequired": true
- },
- {
- "paramName": "p",
- "paramLongName": "provenance",
- "paramDescription": "the infos about the provenance of the collected records",
- "paramRequired": true
- },
- {
- "paramName": "x",
- "paramLongName": "xpath",
- "paramDescription": "the xpath to identify the record identifier",
- "paramRequired": true
- },
- {
- "paramName": "i",
- "paramLongName": "input",
- "paramDescription": "the path of the sequencial file to read",
- "paramRequired": true
- },
- {
- "paramName": "o",
- "paramLongName": "output",
- "paramDescription": "the path of the result DataFrame on HDFS",
- "paramRequired": true
- },
- {
- "paramName": "ru",
- "paramLongName": "rabbitUser",
- "paramDescription": "the user to connect with RabbitMq for messaging",
- "paramRequired": true
- },
- {
- "paramName": "rp",
- "paramLongName": "rabbitPassword",
- "paramDescription": "the password to connect with RabbitMq for messaging",
- "paramRequired": true
- },
- {
- "paramName": "rh",
- "paramLongName": "rabbitHost",
- "paramDescription": "the host of the RabbitMq server",
- "paramRequired": true
- },
- {
- "paramName": "ro",
- "paramLongName": "rabbitOngoingQueue",
- "paramDescription": "the name of the ongoing queue",
- "paramRequired": true
- },
- {
- "paramName": "rr",
- "paramLongName": "rabbitReportQueue",
- "paramDescription": "the name of the report queue",
- "paramRequired": true
- },
- {
- "paramName": "w",
- "paramLongName": "workflowId",
- "paramDescription": "the identifier of the dnet Workflow",
- "paramRequired": true
- },
- {
- "paramName": "t",
- "paramLongName": "isTest",
- "paramDescription": "the name of the report queue",
- "paramRequired": false
- }
-]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json
new file mode 100644
index 0000000000..cd4b8224b2
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json
@@ -0,0 +1,62 @@
+[
+ {
+ "paramName": "a",
+ "paramLongName": "apidescriptor",
+ "paramDescription": "the JSON encoding of the API Descriptor",
+ "paramRequired": true
+ },
+ {
+ "paramName": "n",
+ "paramLongName": "namenode",
+ "paramDescription": "the Name Node URI",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mv",
+ "paramLongName": "mdStoreVersion",
+ "paramDescription": "the MDStore Version bean",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dm",
+ "paramLongName": "dnetMessageManagerURL",
+ "paramDescription": "the End point URL to send Messages",
+ "paramRequired": true
+ },
+ {
+ "paramName": "w",
+ "paramLongName": "workflowId",
+ "paramDescription": "the identifier of the dnet Workflow",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mnr",
+ "paramLongName": "maxNumberOfRetry",
+ "paramDescription": "the maximum number of admitted connection retries",
+ "paramRequired": false
+ },
+ {
+ "paramName": "rqd",
+ "paramLongName": "requestDelay",
+ "paramDescription": "the delay (ms) between requests",
+ "paramRequired": false
+ },
+ {
+ "paramName": "rtd",
+ "paramLongName": "retryDelay",
+ "paramDescription": "the delay (ms) between retries",
+ "paramRequired": false
+ },
+ {
+ "paramName": "cto",
+ "paramLongName": "connectTimeOut",
+ "paramDescription": "the maximum allowed time (ms) to connect to the remote host",
+ "paramRequired": false
+ },
+ {
+ "paramName": "rto",
+ "paramLongName": "readTimeOut",
+ "paramDescription": "the maximum allowed time (ms) to receive content from the remote host",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
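The optional parameters above (maxNumberOfRetry, requestDelay, retryDelay, connectTimeOut, readTimeOut) tune the HTTP client used by the collector worker. A minimal sketch of mapping them into an HttpClientParams bean follows; HttpClientParams is the bean returned by getClientParams() in the REST collector plugin, but the setter names and the default values shown here are assumptions made for illustration.

    import java.util.Optional;

    public class HttpClientParamsFactory {

        public static HttpClientParams fromArgs(
            final String maxNumberOfRetry,
            final String requestDelay,
            final String retryDelay,
            final String connectTimeOut,
            final String readTimeOut) {

            final HttpClientParams params = new HttpClientParams();
            params.setMaxNumberOfRetry(parseOrDefault(maxNumberOfRetry, 3)); // retries per request
            params.setRequestDelay(parseOrDefault(requestDelay, 0));         // delay between requests
            params.setRetryDelay(parseOrDefault(retryDelay, 10));            // delay between retries
            params.setConnectTimeOut(parseOrDefault(connectTimeOut, 30));    // connection timeout
            params.setReadTimeOut(parseOrDefault(readTimeOut, 30));          // read timeout
            return params;
        }

        private static int parseOrDefault(final String value, final int defaultValue) {
            return Optional.ofNullable(value).map(Integer::parseInt).orElse(defaultValue);
        }
    }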
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json
new file mode 100644
index 0000000000..987f004bbc
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json
@@ -0,0 +1,50 @@
+[
+ {
+ "paramName": "issm",
+ "paramLongName": "isSparkSessionManaged",
+ "paramDescription": "when true will stop SparkSession after job execution",
+ "paramRequired": false
+ },
+ {
+ "paramName": "e",
+ "paramLongName": "encoding",
+ "paramDescription": "the encoding of the input record should be JSON or XML",
+ "paramRequired": true
+ },
+ {
+ "paramName": "d",
+ "paramLongName": "dateOfCollection",
+ "paramDescription": "the date when the record has been stored",
+ "paramRequired": true
+ },
+ {
+ "paramName": "p",
+ "paramLongName": "provenance",
+ "paramDescription": "the infos about the provenance of the collected records",
+ "paramRequired": true
+ },
+ {
+ "paramName": "x",
+ "paramLongName": "xpath",
+ "paramDescription": "the xpath to identify the record identifier",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mv",
+ "paramLongName": "mdStoreVersion",
+ "paramDescription": "the Metadata Store Version Info",
+ "paramRequired": true
+ },
+ {
+ "paramName": "rmv",
+ "paramLongName": "readMdStoreVersion",
+ "paramDescription": "the Read Lock Metadata Store Version bean",
+ "paramRequired": false
+ },
+ {
+ "paramName": "w",
+ "paramLongName": "workflowId",
+ "paramDescription": "the identifier of the dnet Workflow",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json
new file mode 100644
index 0000000000..57a218a342
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json
@@ -0,0 +1,45 @@
+[
+ {
+ "paramName": "a",
+ "paramLongName": "action",
+ "paramDescription": "the JSON encoding of the API Descriptor",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mu",
+ "paramLongName": "mdStoreManagerURI",
+ "paramDescription": "the MDStore Manager URI",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mi",
+ "paramLongName": "mdStoreID",
+ "paramDescription": "the Metadata Store ID",
+ "paramRequired": false
+ },
+ {
+ "paramName": "ms",
+ "paramLongName": "mdStoreSize",
+ "paramDescription": "the Metadata Store Size",
+ "paramRequired": false
+ },
+ {
+ "paramName": "mv",
+ "paramLongName": "mdStoreVersion",
+ "paramDescription": "the Metadata Version Bean",
+ "paramRequired": false
+ },
+ {
+ "paramName": "n",
+ "paramLongName": "namenode",
+ "paramDescription": "the Name Node URI",
+ "paramRequired": false
+ },
+ {
+ "paramName": "rm",
+ "paramLongName": "readMDStoreId",
+ "paramDescription": "the ID Locked to Read",
+ "paramRequired": false
+ }
+
+]
\ No newline at end of file
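The action parameter drives the MDStoreActionNode invoked by the collection and transformation workflows further below. The skeleton below only illustrates the dispatch implied by those workflows; the real class is eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode and its internals are not reproduced here, so the branches carry descriptive comments instead of code.

    // Illustrative skeleton, not the actual MDStoreActionNode implementation.
    public class MDStoreActionSketch {

        enum MDAction {
            NEW_VERSION, COMMIT, ROLLBACK, READ_LOCK, READ_UNLOCK
        }

        public static void main(final String[] args) {
            final MDAction action = MDAction.valueOf(args[0]); // value of the --action argument
            switch (action) {
            case NEW_VERSION:
                // ask the MDStore manager (mdStoreManagerURI) for a new version of mdStoreID
                break;
            case COMMIT:
                // seal the version identified by mdStoreVersion, recording its size on the namenode
                break;
            case ROLLBACK:
                // discard the version identified by mdStoreVersion
                break;
            case READ_LOCK:
                // acquire a read lock on mdStoreID and expose the locked version to the workflow
                break;
            case READ_UNLOCK:
                // release the read lock identified by readMDStoreId
                break;
            }
        }
    }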
diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/synch/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml
similarity index 62%
rename from dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/synch/oozie_app/config-default.xml
rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml
index 7c1a43e513..e77dd09c9d 100644
--- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/dhp/sx/synch/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml
@@ -1,4 +1,12 @@
 <configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
     <property>
         <name>oozie.use.system.libpath</name>
         <value>true</value>
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml
index 3e7f684012..0678eed117 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml
@@ -1,112 +1,212 @@
-
- sequenceFilePath
- the path to store the sequence file of the native metadata collected
-
-
-
- mdStorePath
- the path of the native mdstore
-
-
apiDescription
A json encoding of the API Description class
-
dataSourceInfo
A json encoding of the Datasource Info
identifierPath
- An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier
+ An xpath to retrieve the metadata identifier for the generation of DNet Identifier
-
metadataEncoding
The type of the metadata XML/JSON
-
timestamp
The timestamp of the collection date
-
workflowId
The identifier of the workflow
+
+ mdStoreID
+ The identifier of the mdStore
+
+
+ mdStoreManagerURI
+ The URI of the MDStore Manager
+
+
+
+ dnetMessageManagerURL
+ The URI of the Dnet Message Manager
+
+
+ collectionMode
+ Should be REFRESH or INCREMENTAL
+
+
+
+ collection_java_xmx
+ -Xmx200m
+ Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.
+
+
+
-
+
+ ${jobTracker}
+ ${nameNode}
+
+
+
+
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
-
-
-
-
-
-
-
+
+
+
+ ${wf:conf('collectionMode') eq 'REFRESH'}
+ ${wf:conf('collectionMode') eq 'INCREMENTAL'}
+
+
+
+
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ ${collection_java_xmx}
+ --actionREAD_LOCK
+ --mdStoreID${mdStoreID}
+ --mdStoreManagerURI${mdStoreManagerURI}
+
+
+
+
+
+
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ ${collection_java_xmx}
+ --actionNEW_VERSION
+ --mdStoreID${mdStoreID}
+ --mdStoreManagerURI${mdStoreManagerURI}
+
+
+
- ${jobTracker}
- ${nameNode}
- eu.dnetlib.dhp.collection.worker.DnetCollectorWorker
- -p${sequenceFilePath}
- -a${apiDescription}
- -n${nameNode}
- -rh${rmq_host}
- -ru${rmq_user}
- -rp${rmq_pwd}
- -rr${rmq_report}
- -ro${rmq_ongoing}
- -usandro.labruzzo
- -w${workflowId}
+ eu.dnetlib.dhp.collection.CollectorWorkerApplication
+ ${collection_java_xmx}
+ --apidescriptor${apiDescription}
+ --namenode${nameNode}
+ --workflowId${workflowId}
+ --dnetMessageManagerURL${dnetMessageManagerURL}
+ --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']}
+ --maxNumberOfRetry${maxNumberOfRetry}
+ --requestDelay${requestDelay}
+ --retryDelay${retryDelay}
+ --connectTimeOut${connectTimeOut}
+ --readTimeOut${readTimeOut}
-
-
-
-
- ${jobTracker}
- ${nameNode}
- yarn
- cluster
- GenerateNativeStoreSparkJob
- eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob
- dhp-aggregations-1.0.0-SNAPSHOT.jar
- --num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"
- --encoding ${metadataEncoding}
- --dateOfCollection ${timestamp}
- --provenance ${dataSourceInfo}
- --xpath${identifierPath}
- --input${sequenceFilePath}
- --output${mdStorePath}
- -rh${rmq_host}
- -ru${rmq_user}
- -rp${rmq_pwd}
- -rr${rmq_report}
- -ro${rmq_ongoing}
- -w${workflowId}
-
-
-
+
-
-
-
-
+
+
+ yarn
+ cluster
+ Generate Native MetadataStore
+ eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --encoding${metadataEncoding}
+ --dateOfCollection${timestamp}
+ --provenance${dataSourceInfo}
+ --xpath${identifierPath}
+ --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']}
+ --readMdStoreVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']}
+
+
+
+
+
+
+
+ ${wf:conf('collectionMode') eq 'REFRESH'}
+ ${wf:conf('collectionMode') eq 'INCREMENTAL'}
+
+
+
+
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ ${collection_java_xmx}
+ --actionREAD_UNLOCK
+ --mdStoreManagerURI${mdStoreManagerURI}
+ --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']}
+
+
+
+
+
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ ${collection_java_xmx}
+ --actionCOMMIT
+ --namenode${nameNode}
+ --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']}
+ --mdStoreManagerURI${mdStoreManagerURI}
+
+
+
+
+
+
+
+ ${wf:conf('collectionMode') eq 'REFRESH'}
+ ${wf:conf('collectionMode') eq 'INCREMENTAL'}
+
+
+
+
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ ${collection_java_xmx}
+ --actionREAD_UNLOCK
+ --mdStoreManagerURI${mdStoreManagerURI}
+ --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']}
+
+
+
+
+
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ ${collection_java_xmx}
+ --actionROLLBACK
+ --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']}
+ --mdStoreManagerURI${mdStoreManagerURI}
+
+
\ No newline at end of file
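In the workflow above, the StartTransaction and BeginRead steps expose the MDStore version they obtained as JSON through wf:actionData, and the Spark jobs receive it via --mdStoreVersion / --readMdStoreVersion. A hedged sketch of turning that JSON back into a bean follows; the real bean is the MDStoreVersion class from dhp-schemas, while the minimal holder below is hypothetical and lists only the fields needed for the example.

    import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class MdStoreVersionExample {

        // Hypothetical subset of the real MDStoreVersion bean.
        @JsonIgnoreProperties(ignoreUnknown = true)
        public static class MinimalMdStoreVersion {
            public String id;       // version identifier
            public String hdfsPath; // where this version stores its records
        }

        public static MinimalMdStoreVersion parse(final String mdStoreVersionJson) throws Exception {
            return new ObjectMapper().readValue(mdStoreVersionJson, MinimalMdStoreVersion.class);
        }
    }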
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json
deleted file mode 100644
index c247d15e4d..0000000000
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json
+++ /dev/null
@@ -1,12 +0,0 @@
-[
- {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true},
- {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true},
- {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true},
- {"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true},
- {"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true},
- {"paramName":"rp", "paramLongName":"rabbitPassword", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true},
- {"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true},
- {"paramName":"ro", "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true},
- {"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true},
- {"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true}
-]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml
new file mode 100644
index 0000000000..bdd48b0ab2
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml
@@ -0,0 +1,19 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml
index 4b1e3d84bb..fd17289a3e 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml
@@ -1,76 +1,194 @@
-
+
- mdstoreInputPath
- the path of the input MDStore
+ mdStoreInputId
+ the identifier of the native MDStore
-
- mdstoreOutputPath
+ mdStoreOutputId
+ the identifier of the cleaned MDStore
+
+
+ mdStoreManagerURI
the path of the cleaned mdstore
-
- transformationRule
+ transformationRuleId
The transformation Rule to apply
-
- timestamp
- The timestamp of the collection date
+ transformationPlugin
+ XSLT_TRANSFORM
+ The transformation Plugin
+
+
+ dateOfTransformation
+ The timestamp of the transformation date
+
+
+ isLookupUrl
+ The IS lookUp service endpoint
-
workflowId
The identifier of the workflow
+
+ dnetMessageManagerURL
+ The URI of the Dnet Message Manager
+
+
+ recordsPerTask
+ 200
+ The number of records transformed by a single task
+
+
-
+
+
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
-
-
-
-
-
-
+
+
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ --actionREAD_LOCK
+ --mdStoreID${mdStoreInputId}
+ --mdStoreManagerURI${mdStoreManagerURI}
+
+
+
-
-
- ${jobTracker}
- ${nameNode}
- yarn
- cluster
- MDBuilder
- eu.dnetlib.dhp.transformation.TransformSparkJobNode
- dhp-aggregations-1.0.0-SNAPSHOT.jar
- --num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"
- --dateOfCollection ${timestamp}
- -mt yarn
- --input${mdstoreInputPath}
- --output${mdstoreOutputPath}
- -w${workflowId}
- -tr${transformationRule}
- -ru${rmq_user}
- -rp${rmq_pwd}
- -rh${rmq_host}
- -ro${rmq_ongoing}
- -rr${rmq_report}
-
-
-
+
+
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ --actionNEW_VERSION
+ --mdStoreID${mdStoreOutputId}
+ --mdStoreManagerURI${mdStoreManagerURI}
+
+
+
+
-
-
-
-
+
+
+ yarn
+ cluster
+ Transform MetadataStore
+ eu.dnetlib.dhp.transformation.TransformSparkJobNode
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --mdstoreOutputVersion${wf:actionData('StartTransaction')['mdStoreVersion']}
+ --mdstoreInputVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']}
+ --dateOfTransformation${dateOfTransformation}
+ --transformationPlugin${transformationPlugin}
+ --transformationRuleId${transformationRuleId}
+ --isLookupUrl${isLookupUrl}
+ --recordsPerTask${recordsPerTask}
+ --workflowId${workflowId}
+ --dnetMessageManagerURL${dnetMessageManagerURL}
+
+
+
+
+
+
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ --actionREAD_UNLOCK
+ --mdStoreManagerURI${mdStoreManagerURI}
+ --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']}
+
+
+
+
+
+
+
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ --actionCOMMIT
+ --namenode${nameNode}
+ --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']}
+ --mdStoreManagerURI${mdStoreManagerURI}
+
+
+
+
+
+
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ --actionREAD_UNLOCK
+ --mdStoreManagerURI${mdStoreManagerURI}
+ --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']}
+
+
+
+
+
+
+
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode
+ --actionROLLBACK
+ --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']}
+ --mdStoreManagerURI${mdStoreManagerURI}
+
-
+
+
\ No newline at end of file
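The transformation workflow above resolves the rule identified by transformationRuleId through the IS lookup service and applies it to every record of the input MDStore using the plugin named by transformationPlugin (XSLT_TRANSFORM by default). The snippet below is only a plain JAXP illustration of what an XSLT-based transformation of a single record amounts to; it is not the plugin's actual code.

    import java.io.StringReader;
    import java.io.StringWriter;

    import javax.xml.transform.Transformer;
    import javax.xml.transform.TransformerFactory;
    import javax.xml.transform.stream.StreamResult;
    import javax.xml.transform.stream.StreamSource;

    public class XsltTransformExample {

        /** Applies the given XSLT to a native XML record and returns the transformed record. */
        public static String transform(final String xslt, final String nativeRecord) throws Exception {
            final Transformer transformer = TransformerFactory
                .newInstance()
                .newTransformer(new StreamSource(new StringReader(xslt)));
            final StringWriter output = new StringWriter();
            transformer.transform(new StreamSource(new StringReader(nativeRecord)), new StreamResult(output));
            return output.toString();
        }
    }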
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json
index 4bb5fd56a0..4cc2da0c46 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json
@@ -7,20 +7,39 @@
},
{
"paramName": "d",
- "paramLongName": "dateOfCollection",
+ "paramLongName": "dateOfTransformation",
"paramDescription": "the date when the record has been stored",
"paramRequired": true
},
{
"paramName": "i",
- "paramLongName": "input",
- "paramDescription": "the path of the sequencial file to read",
+ "paramLongName": "mdstoreInputVersion",
+ "paramDescription": "the mdStore Version bean of the Input",
"paramRequired": true
},
{
"paramName": "o",
- "paramLongName": "output",
- "paramDescription": "the path of the result DataFrame on HDFS",
+ "paramLongName": "mdstoreOutputVersion",
+ "paramDescription": "the mdStore Version bean of the Output",
+ "paramRequired": true
+ },
+ {
+ "paramName": "tr",
+ "paramLongName": "transformationRuleId",
+ "paramDescription": "the transformation Rule to apply to the input MDStore",
+ "paramRequired": true
+ },
+
+ {
+ "paramName": "i",
+ "paramLongName": "isLookupUrl",
+ "paramDescription": "the Information System Service LookUp URL",
+ "paramRequired": true
+ },
+ {
+ "paramName": "dm",
+ "paramLongName": "dnetMessageManagerURL",
+ "paramDescription": "the End point URL to send Messages",
"paramRequired": true
},
{
@@ -30,45 +49,15 @@
"paramRequired": true
},
{
- "paramName": "tr",
- "paramLongName": "transformationRule",
- "paramDescription": "the transformation Rule to apply to the input MDStore",
- "paramRequired": true
- },
- {
- "paramName": "ru",
- "paramLongName": "rabbitUser",
- "paramDescription": "the user to connect with RabbitMq for messaging",
- "paramRequired": true
- },
- {
- "paramName": "rp",
- "paramLongName": "rabbitPassword",
- "paramDescription": "the password to connect with RabbitMq for messaging",
- "paramRequired": true
- },
- {
- "paramName": "rh",
- "paramLongName": "rabbitHost",
- "paramDescription": "the host of the RabbitMq server",
- "paramRequired": true
- },
- {
- "paramName": "ro",
- "paramLongName": "rabbitOngoingQueue",
- "paramDescription": "the name of the ongoing queue",
- "paramRequired": true
- },
- {
- "paramName": "rr",
- "paramLongName": "rabbitReportQueue",
- "paramDescription": "the name of the report queue",
- "paramRequired": true
- },
- {
- "paramName": "t",
- "paramLongName": "isTest",
- "paramDescription": "the name of the report queue",
+ "paramName": "rpt",
+ "paramLongName": "recordsPerTask",
+ "paramDescription": "the number of records transformed by a single Task",
"paramRequired": false
+ },
+ {
+ "paramName": "tp",
+ "paramLongName": "transformationPlugin",
+ "paramDescription": "the transformation plugin to apply",
+ "paramRequired": true
}
]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala
new file mode 100644
index 0000000000..0d10c41dca
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala
@@ -0,0 +1,44 @@
+package eu.dnetlib.dhp.actionmanager.datacite
+
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.databind.SerializationFeature
+
+import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
+import eu.dnetlib.dhp.schema.oaf.Oaf
+import org.junit.jupiter.api.extension.ExtendWith
+import org.junit.jupiter.api.{BeforeEach, Test}
+import org.mockito.junit.jupiter.MockitoExtension
+
+import scala.io.Source
+
+@ExtendWith(Array(classOf[MockitoExtension]))
+class DataciteToOAFTest extends AbstractVocabularyTest {
+
+
+ @BeforeEach
+ def setUp(): Unit = {
+
+ super.setUpVocabulary()
+ }
+
+ @Test
+ def testMapping(): Unit = {
+ val record = Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString
+
+
+
+ val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
+ val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)
+
+ res.foreach(r => {
+ println(mapper.writeValueAsString(r))
+ println("----------------------------")
+
+ })
+
+
+
+ }
+
+}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java
index 72ba48f418..b7155bc3a4 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java
@@ -12,16 +12,16 @@ import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
-import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException;
-import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
+import eu.dnetlib.dhp.collection.CollectorException;
+import eu.dnetlib.dhp.collection.HttpConnector2;
@Disabled
public class EXCELParserTest {
private static Path workingDir;
- private HttpConnector httpConnector = new HttpConnector();
- private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
+ private HttpConnector2 httpConnector = new HttpConnector2();
+ private static final String URL = "https://cordis.europa.eu/data/reference/cordisref-h2020topics.xlsx";
@BeforeAll
public static void beforeAll() throws IOException {
@@ -30,16 +30,17 @@ public class EXCELParserTest {
}
@Test
- public void test1() throws CollectorServiceException, IOException, InvalidFormatException, ClassNotFoundException,
+ public void test1() throws CollectorException, IOException, InvalidFormatException, ClassNotFoundException,
IllegalAccessException, InstantiationException {
EXCELParser excelParser = new EXCELParser();
- final String classForName = "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic";
- final String sheetName = "Topics";
- List