diff --git a/dhp-applications/dhp-collector-worker/README.md b/dhp-applications/dhp-collector-worker/README.md
new file mode 100644
index 000000000..b6609179d
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/README.md
@@ -0,0 +1,16 @@
+Description of the Module
+--------------------------
+This module defines a **collector worker application** that runs on Hadoop.
+
+It is responsible for harvesting metadata using different plugins, which should be passed as arguments
+to the main class.
+
+## Plugins
+* OAI Plugin
+
+
+## Api Descriptor
+TODO
+
+## Usage
+TODO
\ No newline at end of file
diff --git a/dhp-applications/dhp-collector-worker/pom.xml b/dhp-applications/dhp-collector-worker/pom.xml
new file mode 100644
index 000000000..25d470020
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/pom.xml
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+    <parent>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <artifactId>dhp-applications</artifactId>
+        <version>1.0.0-SNAPSHOT</version>
+        <relativePath>../</relativePath>
+    </parent>
+
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>eu.dnetlib</groupId>
+    <artifactId>dhp-collector-worker</artifactId>
+    <version>1.0.0</version>
+
+    <repositories>
+        <repository>
+            <id>cloudera</id>
+            <name>Cloudera Repository</name>
+            <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>false</enabled>
+            </snapshots>
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-client</artifactId>
+            <version>2.6.0-cdh5.9.2</version>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-core</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-annotations</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>dom4j</groupId>
+            <artifactId>dom4j</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>jaxen</groupId>
+            <artifactId>jaxen</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+        <dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
+        <dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
+        <dhp.spark.version>2.2.0</dhp.spark.version>
+        <scala.version>2.11.8</scala.version>
+    </properties>
+
+</project>
\ No newline at end of file
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java
new file mode 100644
index 000000000..bc4287a0d
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java
@@ -0,0 +1,30 @@
+package eu.dnetlib.collector.worker;
+
+public class DnetCollectorException extends Exception {
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = -290723075076039757L;
+
+ public DnetCollectorException() {
+ super();
+ }
+
+ public DnetCollectorException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) {
+ super(message, cause, enableSuppression, writableStackTrace);
+ }
+
+ public DnetCollectorException(final String message, final Throwable cause) {
+ super(message, cause);
+ }
+
+ public DnetCollectorException(final String message) {
+ super(message);
+ }
+
+ public DnetCollectorException(final Throwable cause) {
+ super(cause);
+ }
+
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java
new file mode 100644
index 000000000..c2854b8f2
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java
@@ -0,0 +1,121 @@
+package eu.dnetlib.collector.worker;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.collector.worker.model.ApiDescriptor;
+import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
+import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.CommandLineRunner;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.concurrent.atomic.AtomicInteger;
+
+
+/**
+ *
+ * DnetCollectorWorkerApplication is the main class responsible for starting
+ * the D-Net collection into HDFS.
+ * This module is executed on the Hadoop cluster and takes as input some parameters
+ * that tell it which collector plugin to use and the HDFS path where the data must be stored
+ *
+ *
+ * @author Sandro La Bruzzo
+ */
+@SpringBootApplication
+public class DnetCollectorWorkerApplication implements CommandLineRunner {
+
+ private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class);
+
+ @Autowired
+ private CollectorPluginEnumerator collectorPluginEnumerator;
+
+ /**
+ *
+ * @param args
+ */
+ public static void main(final String[] args) {
+ SpringApplication.run(DnetCollectorWorkerApplication.class, args);
+ }
+
+ /**
+ * This module expects three arguments:
+ * the HDFS path where the sequence file will be stored,
+ * the JSON serialization of an {@link ApiDescriptor}, and the name node URI
+ */
+ @Override
+ public void run(final String... args) throws Exception {
+ if (args.length == 0) { return; }
+ if (args.length != 3) { throw new DnetCollectorException("Invalid number of parameters, expected: hdfs_path and json_api_description"); }
+
+ //TODO : migrate to https://commons.apache.org/proper/commons-cli/usage.html
+
+
+
+
+ final String hdfsPath = args[0];
+
+ log.info("hdfsPath ="+hdfsPath);
+
+ final String json = args[1];
+
+
+ final String nameNode = args[2];
+
+ log.info("json = "+json);
+ final ObjectMapper jsonMapper = new ObjectMapper();
+ final ApiDescriptor api = jsonMapper.readValue(json, ApiDescriptor.class);
+
+ final CollectorPlugin plugin = collectorPluginEnumerator.getPluginByProtocol(api.getProtocol());
+
+ final String hdfsuri =nameNode;
+
+ // ====== Init HDFS File System Object
+ Configuration conf = new Configuration();
+ // Set FileSystem URI
+ conf.set("fs.defaultFS", hdfsuri);
+ // Because of Maven
+ conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+ conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+
+ System.setProperty("HADOOP_USER_NAME", "sandro.labruzzo");
+ System.setProperty("hadoop.home.dir", "/");
+ //Get the filesystem - HDFS
+ FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf);
+ Path hdfswritepath = new Path(hdfsPath);
+
+ log.info("Created path "+hdfswritepath.toString());
+
+ try(SequenceFile.Writer writer = SequenceFile.createWriter(conf,
+ SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class),
+ SequenceFile.Writer.valueClass(Text.class))) {
+
+ final AtomicInteger counter = new AtomicInteger(0);
+ final IntWritable key = new IntWritable(counter.get());
+ final Text value = new Text();
+
+ plugin.collect(api).forEach(content -> {
+
+ key.set(counter.getAndIncrement());
+ value.set(content);
+ try {
+ writer.append(key, value);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ });
+ }
+ }
+
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java
new file mode 100644
index 000000000..6874d97f4
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java
@@ -0,0 +1,50 @@
+package eu.dnetlib.collector.worker.model;
+
+import java.util.HashMap;
+import java.util.Map;
+
+
+//TODO should be moved to dhp-common
+public class ApiDescriptor {
+
+ private String id;
+
+ private String baseUrl;
+
+ private String protocol;
+
+ private Map<String, String> params = new HashMap<>();
+
+ public String getBaseUrl() {
+ return baseUrl;
+ }
+
+ public void setBaseUrl(final String baseUrl) {
+ this.baseUrl = baseUrl;
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(final String id) {
+ this.id = id;
+ }
+
+ public Map<String, String> getParams() {
+ return params;
+ }
+
+ public void setParams(final HashMap<String, String> params) {
+ this.params = params;
+ }
+
+ public String getProtocol() {
+ return protocol;
+ }
+
+ public void setProtocol(final String protocol) {
+ this.protocol = protocol;
+ }
+
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java
new file mode 100644
index 000000000..5ec1e9a6e
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java
@@ -0,0 +1,11 @@
+package eu.dnetlib.collector.worker.plugins;
+
+import java.util.stream.Stream;
+
+import eu.dnetlib.collector.worker.DnetCollectorException;
+import eu.dnetlib.collector.worker.model.ApiDescriptor;
+
+public interface CollectorPlugin {
+
+ Stream<String> collect(ApiDescriptor api) throws DnetCollectorException;
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java
new file mode 100644
index 000000000..7c4bec192
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java
@@ -0,0 +1,66 @@
+package eu.dnetlib.collector.worker.plugins.oai;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Spliterator;
+import java.util.Spliterators;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.collector.worker.DnetCollectorException;
+import eu.dnetlib.collector.worker.model.ApiDescriptor;
+import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
+import eu.dnetlib.collector.worker.utils.DnetWorkerCollector;
+
+@Component
+@DnetWorkerCollector("oai")
+public class OaiCollectorPlugin implements CollectorPlugin {
+
+ private static final String FORMAT_PARAM = "format";
+ private static final String OAI_SET_PARAM = "set";
+ private static final Object OAI_FROM_DATE_PARAM = "fromDate";
+ private static final Object OAI_UNTIL_DATE_PARAM = "untilDate";
+
+ @Autowired
+ private OaiIteratorFactory oaiIteratorFactory;
+
+ @Override
+ public Stream<String> collect(final ApiDescriptor api) throws DnetCollectorException {
+ final String baseUrl = api.getBaseUrl();
+ final String mdFormat = api.getParams().get(FORMAT_PARAM);
+ final String setParam = api.getParams().get(OAI_SET_PARAM);
+ final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM);
+ final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM);
+
+ final List<String> sets = new ArrayList<>();
+ if (setParam != null) {
+ sets.addAll(Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam)));
+ }
+ if (sets.isEmpty()) {
+ // If no set is defined, ALL the sets must be harvested
+ sets.add("");
+ }
+
+ if (baseUrl == null || baseUrl.isEmpty()) { throw new DnetCollectorException("Param 'baseurl' is null or empty"); }
+
+ if (mdFormat == null || mdFormat.isEmpty()) { throw new DnetCollectorException("Param 'mdFormat' is null or empty"); }
+
+ if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); }
+
+ if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); }
+
+ final Iterator<Iterator<String>> iters = sets.stream()
+ .map(set -> oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
+ .iterator();
+
+ return StreamSupport.stream(Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false);
+ }
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java
new file mode 100644
index 000000000..191b7b596
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java
@@ -0,0 +1,163 @@
+package eu.dnetlib.collector.worker.plugins.oai;
+
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+import java.util.Iterator;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Node;
+import org.dom4j.io.SAXReader;
+
+import eu.dnetlib.collector.worker.DnetCollectorException;
+import eu.dnetlib.collector.worker.utils.HttpConnector;
+import eu.dnetlib.collector.worker.utils.XmlCleaner;
+
+public class OaiIterator implements Iterator<String> {
+
+ private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
+
+ private final Queue<String> queue = new PriorityBlockingQueue<>();
+ private final SAXReader reader = new SAXReader();
+
+ private final String baseUrl;
+ private final String set;
+ private final String mdFormat;
+ private final String fromDate;
+ private final String untilDate;
+ private String token;
+ private boolean started;
+ private final HttpConnector httpConnector;
+
+ public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate,
+ final HttpConnector httpConnector) {
+ this.baseUrl = baseUrl;
+ this.mdFormat = mdFormat;
+ this.set = set;
+ this.fromDate = fromDate;
+ this.untilDate = untilDate;
+ this.started = false;
+ this.httpConnector = httpConnector;
+ }
+
+ private void verifyStarted() {
+ if (!this.started) {
+ this.started = true;
+ try {
+ this.token = firstPage();
+ } catch (final DnetCollectorException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ @Override
+ public boolean hasNext() {
+ synchronized (queue) {
+ verifyStarted();
+ return !queue.isEmpty();
+ }
+ }
+
+ @Override
+ public String next() {
+ synchronized (queue) {
+ verifyStarted();
+ final String res = queue.poll();
+ while (queue.isEmpty() && token != null && !token.isEmpty()) {
+ try {
+ token = otherPages(token);
+ } catch (final DnetCollectorException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ return res;
+ }
+ }
+
+ @Override
+ public void remove() {}
+
+ private String firstPage() throws DnetCollectorException {
+ try {
+ String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8");
+ if (set != null && !set.isEmpty()) {
+ url += "&set=" + URLEncoder.encode(set, "UTF-8");
+ }
+ if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
+ url += "&from=" + URLEncoder.encode(fromDate, "UTF-8");
+ }
+ if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
+ url += "&until=" + URLEncoder.encode(untilDate, "UTF-8");
+ }
+ log.info("Start harvesting using url: " + url);
+
+ return downloadPage(url);
+ } catch (final UnsupportedEncodingException e) {
+ throw new DnetCollectorException(e);
+ }
+ }
+
+ private String extractResumptionToken(final String xml) {
+
+ final String s = StringUtils.substringAfter(xml, "<resumptionToken");
+ final String result = StringUtils.substringBetween(s, ">", "</resumptionToken>");
+ if (result == null) { return null; }
+ return result.trim();
+
+ }
+
+ private String otherPages(final String resumptionToken) throws DnetCollectorException {
+ try {
+ return downloadPage(baseUrl + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken, "UTF-8"));
+ } catch (final UnsupportedEncodingException e) {
+ throw new DnetCollectorException(e);
+ }
+ }
+
+ private String downloadPage(final String url) throws DnetCollectorException {
+
+ final String xml = httpConnector.getInputSource(url);
+ Document doc;
+ try {
+ doc = reader.read(new StringReader(xml));
+ } catch (final DocumentException e) {
+ log.warn("Error parsing xml, I try to clean it: " + xml, e);
+ final String cleaned = XmlCleaner.cleanAllEntities(xml);
+ try {
+ doc = reader.read(new StringReader(cleaned));
+ } catch (final DocumentException e1) {
+ final String resumptionToken = extractResumptionToken(xml);
+ if (resumptionToken == null) { throw new DnetCollectorException("Error parsing cleaned document:" + cleaned, e1); }
+ return resumptionToken;
+ }
+ }
+
+ final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
+ if (errorNode != null) {
+ final String code = errorNode.valueOf("@code");
+ if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
+ log.warn("noRecordsMatch for oai call: " + url);
+ return null;
+ } else {
+ throw new DnetCollectorException(code + " - " + errorNode.getText());
+ }
+ }
+
+ for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
+ queue.add(((Node) o).asXML());
+ }
+
+ return doc.valueOf("//*[local-name()='resumptionToken']");
+
+ }
+
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIteratorFactory.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIteratorFactory.java
new file mode 100644
index 000000000..16af15191
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIteratorFactory.java
@@ -0,0 +1,20 @@
+package eu.dnetlib.collector.worker.plugins.oai;
+
+import java.util.Iterator;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import eu.dnetlib.collector.worker.utils.HttpConnector;
+
+@Component
+public class OaiIteratorFactory {
+
+ @Autowired
+ private HttpConnector httpConnector;
+
+ public Iterator<String> newIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate) {
+ return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, httpConnector);
+ }
+
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java
new file mode 100644
index 000000000..3f5b245ef
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java
@@ -0,0 +1,24 @@
+package eu.dnetlib.collector.worker.utils;
+
+import java.util.List;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
+
+@Component
+public class CollectorPluginEnumerator {
+
+ @Autowired
+ private List<CollectorPlugin> plugins;
+
+ public CollectorPlugin getPluginByProtocol(final String protocol) {
+ return plugins.stream()
+ .filter(p -> p.getClass().isAnnotationPresent(DnetWorkerCollector.class))
+ .filter(p -> p.getClass().getAnnotation(DnetWorkerCollector.class).value().equalsIgnoreCase(protocol))
+ .findFirst()
+ .get();
+ }
+
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java
new file mode 100644
index 000000000..062f6c7a8
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java
@@ -0,0 +1,19 @@
+package eu.dnetlib.collector.worker.utils;
+
+import java.util.LinkedList;
+
+public class CollectorPluginErrorLogList extends LinkedList<String> {
+
+ private static final long serialVersionUID = -6925786561303289704L;
+
+ @Override
+ public String toString() {
+ String log = new String();
+ int index = 0;
+ for (final String errorMessage : this) {
+ log += String.format("Retry #%s: %s / ", index++, errorMessage);
+ }
+ return log;
+ }
+
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java
new file mode 100644
index 000000000..28891c84f
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java
@@ -0,0 +1,14 @@
+package eu.dnetlib.collector.worker.utils;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+@Retention(RetentionPolicy.RUNTIME)
+@Target(ElementType.TYPE)
+public @interface DnetWorkerCollector {
+
+ String value();
+
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java
new file mode 100644
index 000000000..8a24381e5
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java
@@ -0,0 +1,226 @@
+package eu.dnetlib.collector.worker.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.CookieHandler;
+import java.net.CookieManager;
+import java.net.CookiePolicy;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.security.GeneralSecurityException;
+import java.security.cert.X509Certificate;
+import java.util.List;
+import java.util.Map;
+
+import javax.net.ssl.HttpsURLConnection;
+import javax.net.ssl.SSLContext;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.math.NumberUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.springframework.stereotype.Component;
+
+import eu.dnetlib.collector.worker.DnetCollectorException;
+
+@Component
+public class HttpConnector {
+
+ private static final Log log = LogFactory.getLog(HttpConnector.class);
+
+ private int maxNumberOfRetry = 6;
+ private int defaultDelay = 120; // seconds
+ private int readTimeOut = 120; // seconds
+
+ private String responseType = null;
+
+ private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
+
+ public HttpConnector() {
+ CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
+ }
+
+ /**
+ * Given the URL returns the content via HTTP GET
+ *
+ * @param requestUrl
+ * the URL
+ * @return the content of the downloaded resource
+ * @throws DnetCollectorException
+ * when retrying more than maxNumberOfRetry times
+ */
+ public String getInputSource(final String requestUrl) throws DnetCollectorException {
+ return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
+ }
+
+ /**
+ * Given the URL returns the content as a stream via HTTP GET
+ *
+ * @param requestUrl
+ * the URL
+ * @return the content of the downloaded resource as InputStream
+ * @throws DnetCollectorException
+ * when retrying more than maxNumberOfRetry times
+ */
+ public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException {
+ return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
+ }
+
+ private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
+ throws DnetCollectorException {
+ try {
+ final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
+ try {
+ return IOUtils.toString(s);
+ } catch (final IOException e) {
+ log.error("error while retrieving from http-connection occured: " + requestUrl, e);
+ Thread.sleep(defaultDelay * 1000);
+ errorList.add(e.getMessage());
+ return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
+ } finally {
+ IOUtils.closeQuietly(s);
+ }
+ } catch (final InterruptedException e) {
+ throw new DnetCollectorException(e);
+ }
+ }
+
+ private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
+ throws DnetCollectorException {
+
+ if (retryNumber > maxNumberOfRetry) { throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); }
+
+ log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
+ try {
+ InputStream input = null;
+
+ try {
+ final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
+ urlConn.setInstanceFollowRedirects(false);
+ urlConn.setReadTimeout(readTimeOut * 1000);
+ urlConn.addRequestProperty("User-Agent", userAgent);
+
+ if (log.isDebugEnabled()) {
+ logHeaderFields(urlConn);
+ }
+
+ final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
+ if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
+ log.warn("waiting and repeating request after " + retryAfter + " sec.");
+ Thread.sleep(retryAfter * 1000);
+ errorList.add("503 Service Unavailable");
+ urlConn.disconnect();
+ return attemptDownload(requestUrl, retryNumber + 1, errorList);
+ } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) {
+ final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
+ log.debug("The requested url has been moved to " + newUrl);
+ errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
+ urlConn.disconnect();
+ return attemptDownload(newUrl, retryNumber + 1, errorList);
+ } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
+ log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
+ Thread.sleep(defaultDelay * 1000);
+ errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
+ urlConn.disconnect();
+ return attemptDownload(requestUrl, retryNumber + 1, errorList);
+ } else {
+ input = urlConn.getInputStream();
+ responseType = urlConn.getContentType();
+ return input;
+ }
+ } catch (final IOException e) {
+ log.error("error while retrieving from http-connection occured: " + requestUrl, e);
+ Thread.sleep(defaultDelay * 1000);
+ errorList.add(e.getMessage());
+ return attemptDownload(requestUrl, retryNumber + 1, errorList);
+ }
+ } catch (final InterruptedException e) {
+ throw new DnetCollectorException(e);
+ }
+ }
+
+ private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
+ log.debug("StatusCode: " + urlConn.getResponseMessage());
+
+ for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
+ if (e.getKey() != null) {
+ for (final String v : e.getValue()) {
+ log.debug(" key: " + e.getKey() + " - value: " + v);
+ }
+ }
+ }
+ }
+
+ private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
+ for (final String key : headerMap.keySet()) {
+ if (key != null && key.toLowerCase().equals("retry-after") && headerMap.get(key).size() > 0
+ && NumberUtils.isNumber(headerMap.get(key).get(0))) { return Integer.parseInt(headerMap.get(key).get(0)) + 10; }
+ }
+ return -1;
+ }
+
+ private String obtainNewLocation(final Map<String, List<String>> headerMap) throws DnetCollectorException {
+ for (final String key : headerMap.keySet()) {
+ if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { return headerMap.get(key).get(0); }
+ }
+ throw new DnetCollectorException("The requested url has been MOVED, but 'location' param is MISSING");
+ }
+
+ /**
+ * register for https scheme; this is a workaround and not intended for the use in trusted environments
+ */
+ public void initTrustManager() {
+ final X509TrustManager tm = new X509TrustManager() {
+
+ @Override
+ public void checkClientTrusted(final X509Certificate[] xcs, final String string) {}
+
+ @Override
+ public void checkServerTrusted(final X509Certificate[] xcs, final String string) {}
+
+ @Override
+ public X509Certificate[] getAcceptedIssuers() {
+ return null;
+ }
+ };
+ try {
+ final SSLContext ctx = SSLContext.getInstance("TLS");
+ ctx.init(null, new TrustManager[] { tm }, null);
+ HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
+ } catch (final GeneralSecurityException e) {
+ log.fatal(e);
+ throw new IllegalStateException(e);
+ }
+ }
+
+ public int getMaxNumberOfRetry() {
+ return maxNumberOfRetry;
+ }
+
+ public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
+ this.maxNumberOfRetry = maxNumberOfRetry;
+ }
+
+ public int getDefaultDelay() {
+ return defaultDelay;
+ }
+
+ public void setDefaultDelay(final int defaultDelay) {
+ this.defaultDelay = defaultDelay;
+ }
+
+ public int getReadTimeOut() {
+ return readTimeOut;
+ }
+
+ public void setReadTimeOut(final int readTimeOut) {
+ this.readTimeOut = readTimeOut;
+ }
+
+ public String getResponseType() {
+ return responseType;
+ }
+
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java
new file mode 100644
index 000000000..7d1121a6d
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java
@@ -0,0 +1,259 @@
+package eu.dnetlib.collector.worker.utils;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * @author jochen, Andreas Czerniak
+ *
+ */
+/**
+ * Utility that repairs common XML brokenness before parsing: undefined/illegal
+ * entities, invalid control characters and bare ampersands.
+ *
+ * NOTE(review): many entity string literals below look corrupted (empty keys,
+ * identical search/replace pairs, an embedded line break) — they appear to have
+ * been HTML-decoded at some point; verify them against the upstream source.
+ *
+ * @author jochen, Andreas Czerniak
+ */
+public class XmlCleaner {
+
+ /**
+ * Pattern for numeric entities.
+ */
+ private static final Pattern validCharacterEntityPattern = Pattern.compile("^?\\d{2,4};"); //$NON-NLS-1$
+
+ // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to
+ private static final Pattern invalidControlCharPattern = Pattern.compile("?1[0-9a-fA-F];");
+
+ /**
+ * Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
+ * [#x10000-#x10FFFF]
+ */
+ private static final Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$
+
+ // Map entities to their unicode equivalent (typed collections instead of the raw Set/Map of the original)
+ private static final Set<String> goodEntities = new HashSet<>();
+ private static final Map<String, String> badEntities = new HashMap<>();
+
+ static {
+ // pre-defined XML entities
+ goodEntities.add("""); //$NON-NLS-1$ // quotation mark
+ goodEntities.add("&"); //$NON-NLS-1$ // ampersand
+ goodEntities.add("<"); //$NON-NLS-1$ // less-than sign
+ goodEntities.add(">"); //$NON-NLS-1$ // greater-than sign
+ // control entities
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("
", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+ // misc entities
+ badEntities.put("€", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro
+ badEntities.put("‘", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark
+ badEntities.put("’", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark
+ // Latin 1 entities
+ badEntities.put(" ", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space
+ badEntities.put("¡", "\u00A1"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark
+ badEntities.put("¢", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign
+ badEntities.put("£", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign
+ badEntities.put("¤", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign
+ badEntities.put("¥", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign
+ badEntities.put("¦", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar
+ badEntities.put("§", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign
+ badEntities.put("¨", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis
+ badEntities.put("©", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign
+ badEntities.put("ª", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator
+ badEntities.put("«", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark
+ badEntities.put("¬", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign
+ badEntities.put("", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen
+ badEntities.put("®", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign
+ badEntities.put("¯", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron
+ badEntities.put("°", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign
+ badEntities.put("±", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign
+ badEntities.put("²", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two
+ badEntities.put("³", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three
+ badEntities.put("´", "\u00B4"); //$NON-NLS-1$ //$NON-NLS-2$ // acute accent
+ badEntities.put("µ", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign
+ badEntities.put("¶", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign
+ badEntities.put("·", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot
+ badEntities.put("¸", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla
+ badEntities.put("¹", "\u00B9"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript one
+ badEntities.put("º", "\u00BA"); //$NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal indicator
+ badEntities.put("»", "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark
+ badEntities.put("¼", "\u00BC"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter
+ badEntities.put("½", "\u00BD"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half
+ badEntities.put("¾", "\u00BE"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters
+ badEntities.put("¿", "\u00BF"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted question mark
+ badEntities.put("À", "\u00C0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave
+ badEntities.put("Á", "\u00C1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute
+ badEntities.put("Â", "\u00C2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex
+ badEntities.put("Ã", "\u00C3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde
+ badEntities.put("Ä", "\u00C4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis
+ badEntities.put("Å", "\u00C5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above
+ badEntities.put("Æ", "\u00C6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE
+ badEntities.put("Ç", "\u00C7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla
+ badEntities.put("È", "\u00C8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave
+ badEntities.put("É", "\u00C9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with acute
+ badEntities.put("Ê", "\u00CA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex
+ badEntities.put("Ë", "\u00CB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis
+ badEntities.put("Ì", "\u00CC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave
+ badEntities.put("Í", "\u00CD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute
+ badEntities.put("Î", "\u00CE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with circumflex
+ badEntities.put("Ï", "\u00CF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis
+ badEntities.put("Ð", "\u00D0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH
+ badEntities.put("Ñ", "\u00D1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde
+ badEntities.put("Ò", "\u00D2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave
+ badEntities.put("Ó", "\u00D3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute
+ badEntities.put("Ô", "\u00D4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex
+ badEntities.put("Õ", "\u00D5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde
+ badEntities.put("Ö", "\u00D6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis
+ badEntities.put("×", "\u00D7"); //$NON-NLS-1$ //$NON-NLS-2$ // multiplication sign
+ badEntities.put("Ø", "\u00D8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke
+ badEntities.put("Ù", "\u00D9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave
+ badEntities.put("Ú", "\u00DA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute
+ badEntities.put("Û", "\u00DB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex
+ badEntities.put("Ü", "\u00DC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with diaeresis
+ badEntities.put("Ý", "\u00DD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter Y with acute
+ badEntities.put("Þ", "\u00DE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN
+ badEntities.put("ß", "\u00DF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s
+ badEntities.put("à", "\u00E0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave
+ badEntities.put("á", "\u00E1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute
+ badEntities.put("â", "\u00E2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex
+ badEntities.put("ã", "\u00E3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde
+ badEntities.put("ä", "\u00E4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis
+ badEntities.put("å", "\u00E5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above
+ badEntities.put("æ", "\u00E6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae
+ badEntities.put("ç", "\u00E7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla
+ badEntities.put("è", "\u00E8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave
+ badEntities.put("é", "\u00E9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute
+ badEntities.put("ê", "\u00EA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex
+ badEntities.put("ë", "\u00EB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis
+ badEntities.put("ì", "\u00EC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave
+ badEntities.put("í", "\u00ED"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute
+ badEntities.put("î", "\u00EE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex
+ badEntities.put("ï", "\u00EF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis
+ badEntities.put("ð", "\u00F0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth
+ badEntities.put("ñ", "\u00F1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde
+ badEntities.put("ò", "\u00F2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with grave
+ badEntities.put("ó", "\u00F3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute
+ badEntities.put("ô", "\u00F4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex
+ badEntities.put("õ", "\u00F5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde
+ badEntities.put("ö", "\u00F6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis
+ badEntities.put("÷", "\u00F7"); //$NON-NLS-1$ //$NON-NLS-2$ // division sign
+ badEntities.put("ø", "\u00F8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke
+ badEntities.put("ù", "\u00F9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave
+ badEntities.put("ú", "\u00FA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute
+ badEntities.put("û", "\u00FB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex
+ badEntities.put("ü", "\u00FC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis
+ badEntities.put("ý", "\u00FD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute
+ badEntities.put("þ", "\u00FE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn
+ badEntities.put("ÿ", "\u00FF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis
+ }
+
+ /**
+ * For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove it. For each
+ * instance of a bare {@literal &}, escape it. XML only allows 4 entities: {@literal &}, {@literal "}, {@literal <} and {@literal >}.
+ *
+ * @param broken
+ * the string to handle entities
+ * @return the string with entities appropriately fixed up, or {@code null} for {@code null} input
+ */
+ static public String cleanAllEntities(final String broken) {
+ if (broken == null) { return null; }
+
+ // strip invalid control characters and characters outside the XML 1.0 range
+ String working = invalidControlCharPattern.matcher(broken).replaceAll("");
+ working = invalidCharacterPattern.matcher(working).replaceAll("");
+
+ int cleanfrom = 0;
+
+ while (true) {
+ int amp = working.indexOf('&', cleanfrom);
+ // If there are no more amps then we are done
+ if (amp == -1) {
+ break;
+ }
+ // Skip numeric character references, they are valid as-is
+ if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
+ cleanfrom = working.indexOf(';', amp) + 1;
+ continue;
+ }
+ int i = amp + 1;
+ while (true) {
+ // if we are at the end of the string then just escape the '&';
+ if (i >= working.length()) { return working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$
+ }
+ // if we have come to a ; then we have an entity
+ // If it is something that xml can't handle then replace it.
+ final char c = working.charAt(i);
+ if (c == ';') {
+ final String entity = working.substring(amp, i + 1);
+ final String replace = handleEntity(entity);
+ working = working.substring(0, amp) + replace + working.substring(i + 1);
+ break;
+ }
+ // Did we end an entity without finding a closing ;
+ // Then treat it as an '&' that needs to be escaped
+ if (!Character.isLetterOrDigit(c)) {
+ working = working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$
+ amp = i + 4; // account for the 4 extra characters
+ break;
+ }
+ i++;
+ }
+ cleanfrom = amp + 1;
+ }
+
+ // NOTE(review): the search and replacement literals below display as identical; the
+ // replacement was presumably an escaped form (&lt;&lt; / &gt;&gt;) before this file
+ // was HTML-decoded — verify against the upstream source
+ if (Pattern.compile("<<").matcher(working).find()) {
+ working = working.replaceAll("<<", "<<");
+ }
+
+ if (Pattern.compile(">>").matcher(working).find()) {
+ working = working.replaceAll(">>", ">>");
+ }
+
+ return working;
+ }
+
+ /**
+ * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only allows 4 entities:
+ * &, ", < and >.
+ *
+ * @param entity
+ * the entity to be replaced
+ * @return the substitution for the entity, either itself, the unicode equivalent or an empty string.
+ */
+ private static String handleEntity(final String entity) {
+ if (goodEntities.contains(entity)) { return entity; }
+
+ // known bad entity -> unicode equivalent; unknown entity -> stripped
+ // (the original also had a dead "replace != null ? replace : ..." after this lookup)
+ final String replace = badEntities.get(entity);
+ return replace != null ? replace : "";
+ }
+}
diff --git a/dhp-applications/dhp-collector-worker/src/main/resources/application.properties b/dhp-applications/dhp-collector-worker/src/main/resources/application.properties
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/main/resources/application.properties
@@ -0,0 +1 @@
+
diff --git a/dhp-applications/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-applications/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java
new file mode 100644
index 000000000..24f78f318
--- /dev/null
+++ b/dhp-applications/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java
@@ -0,0 +1,45 @@
+package eu.dnetlib.collector.worker;
+
+import static org.junit.Assert.assertNotNull;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.CommandLineRunner;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.context.ApplicationContext;
+import org.springframework.test.context.junit4.SpringRunner;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.collector.worker.model.ApiDescriptor;
+import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator;
+
+@RunWith(SpringRunner.class)
+@SpringBootTest
+public class DnetCollectorWorkerApplicationTests {
+
+ @Autowired
+ private ApplicationContext ctx;
+
+ /** The OAI collector plugin must be discoverable by protocol name, case-insensitively. */
+ @Test
+ public void testFindPlugin() throws Exception {
+ final CollectorPluginEnumerator collectorPluginEnumerator = ctx.getBean(CollectorPluginEnumerator.class);
+ assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai"));
+ assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI"));
+ }
+
+ /** An OAI ApiDescriptor must serialize to JSON without errors. */
+ @Test
+ public void testCollectionOAI() throws Exception {
+ final ApiDescriptor api = new ApiDescriptor();
+ api.setId("oai");
+ api.setProtocol("oai");
+ api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai");
+ api.getParams().put("format", "oai_dc");
+
+ final ObjectMapper mapper = new ObjectMapper();
+
+ // assert instead of printing: the original only logged the JSON and could never fail
+ assertNotNull(mapper.writeValueAsString(api));
+ }
+
+}
diff --git a/dhp-applications/dhp-mdstore-manager-app/pom.xml b/dhp-applications/dhp-mdstore-manager-app/pom.xml
index 292a3ca34..db3d702ce 100644
--- a/dhp-applications/dhp-mdstore-manager-app/pom.xml
+++ b/dhp-applications/dhp-mdstore-manager-app/pom.xml
@@ -3,30 +3,31 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4.0.0
- eu.dnetlib.dhp
dhp-mdstore-manager-app
1.1.0-SNAPSHOT
- org.springframework.boot
- spring-boot-starter-parent
- 2.1.3.RELEASE
-
+ eu.dnetlib.dhp
+ dhp-applications
+ 1.0.0-SNAPSHOT
+ ../
-
+
+
@@ -84,16 +85,6 @@
-
-
-
- org.springframework.boot
- spring-boot-maven-plugin
-
- true
-
-
-
-
+
diff --git a/dhp-applications/dhp-mdstore-manager-app/src/main/resources/application.properties b/dhp-applications/dhp-mdstore-manager-app/src/main/resources/application.properties
index 792c39865..46f38ca64 100644
--- a/dhp-applications/dhp-mdstore-manager-app/src/main/resources/application.properties
+++ b/dhp-applications/dhp-mdstore-manager-app/src/main/resources/application.properties
@@ -2,8 +2,8 @@ spring.main.banner-mode = console
logging.level.root = INFO
spring.datasource.url=jdbc:postgresql://localhost:5432/mdstoremanager
-spring.datasource.username=
-spring.datasource.password=
+spring.datasource.username=dnet
+spring.datasource.password=dnetPwd
spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect
diff --git a/dhp-applications/pom.xml b/dhp-applications/pom.xml
index 51d4047ac..7e52fc76d 100644
--- a/dhp-applications/pom.xml
+++ b/dhp-applications/pom.xml
@@ -5,11 +5,46 @@
eu.dnetlib.dhp
dhp
1.0.0-SNAPSHOT
+ ../
dhp-applications
pom
dhp-mdstore-manager-app
+ dhp-collector-worker
+
+
+
+
+ org.springframework.boot
+ spring-boot-dependencies
+ 2.1.3.RELEASE
+ pom
+ import
+
+
+
+
+
+
+ eu.dnetlib.dhp
+ dhp-common
+ 1.0.0-SNAPSHOT
+
+
+
+
+
+
+ org.springframework.boot
+ spring-boot-maven-plugin
+
+ true
+
+
+
+
+
diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index 3b7b6aca3..2de3dc310 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -8,6 +8,7 @@
eu.dnetlib.dhp
dhp
1.0.0-SNAPSHOT
+ ../
dhp-common
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java
new file mode 100644
index 000000000..3be1e1386
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java
@@ -0,0 +1,105 @@
+package eu.dnetlib.dhp.model.mdstore;
+
+import eu.dnetlib.dhp.utils.DHPUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.Serializable;
+
+
+/**
+ * This class models a record inside the new MetadataStore
+ *
+ */
+/**
+ * This class models a record inside the new MetadataStore.
+ * Identity is based on the record id, compared case-insensitively.
+ */
+public class MetadataRecord implements Serializable {
+
+ // DNet identifier, built as nsPrefix::md5(originalId)
+ private String id;
+
+ // identifier assigned by the original datasource
+ private String originalId;
+
+ // encoding of the body, e.g. "XML" — TODO confirm the set of allowed values
+ private String encoding;
+
+ // provenance (datasource) that supplied the record
+ private Provenance provenance;
+
+ // the record payload
+ private String body;
+
+ // collection timestamp, epoch milliseconds
+ private long dateOfCollection;
+
+
+ public MetadataRecord() {
+ this.dateOfCollection = System.currentTimeMillis();
+ }
+
+ public MetadataRecord(String originalId, String encoding, Provenance provenance, String body, long dateOfCollection) {
+
+ this.originalId = originalId;
+ this.encoding = encoding;
+ this.provenance = provenance;
+ this.body = body;
+ this.dateOfCollection = dateOfCollection;
+ // NOTE: requires a non-null provenance; a null one fails fast here with an NPE
+ this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix());
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+
+ public String getOriginalId() {
+ return originalId;
+ }
+
+ public void setOriginalId(String originalId) {
+ this.originalId = originalId;
+ }
+
+ public String getEncoding() {
+ return encoding;
+ }
+
+ public void setEncoding(String encoding) {
+ this.encoding = encoding;
+ }
+
+ public Provenance getProvenance() {
+ return provenance;
+ }
+
+ public void setProvenance(Provenance provenance) {
+ this.provenance = provenance;
+ }
+
+
+ public String getBody() {
+ return body;
+ }
+
+ public void setBody(String body) {
+ this.body = body;
+ }
+
+ public long getDateOfCollection() {
+ return dateOfCollection;
+ }
+
+ public void setDateOfCollection(long dateOfCollection) {
+ this.dateOfCollection = dateOfCollection;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof MetadataRecord)) {
+ return false;
+ }
+ final String otherId = ((MetadataRecord) o).getId();
+ // null-safe: the original threw NPE when either id was null (e.g. default-constructed records)
+ if (id == null || otherId == null) {
+ return id == null && otherId == null;
+ }
+ return id.equalsIgnoreCase(otherId);
+ }
+
+ @Override
+ public int hashCode() {
+ // lower-cased to stay consistent with the case-insensitive equals
+ // (the original used id.hashCode(), so equal records could hash differently), null-safe
+ return id == null ? 0 : id.toLowerCase().hashCode();
+ }
+}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java
new file mode 100644
index 000000000..de67281f2
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java
@@ -0,0 +1,56 @@
+package eu.dnetlib.dhp.model.mdstore;
+
+import java.io.Serializable;
+
+
+/**
+ * @author Sandro La Bruzzo
+ *
+ * Provenace class models the provenance of the record in the metadataStore
+ * It contains the identifier and the name of the datasource that gives the
+ * record
+ *
+ */
+/**
+ * @author Sandro La Bruzzo
+ *
+ * Provenance class models the provenance of the record in the metadataStore.
+ * It contains the identifier and the name of the datasource that gives the
+ * record.
+ *
+ */
+public class Provenance implements Serializable {
+
+ // identifier of the datasource that provided the record
+ private String datasourceId;
+
+
+ // human-readable name of the datasource
+ private String datasourceName;
+
+ // namespace prefix of the datasource, used to build DNet record identifiers
+ private String nsPrefix;
+
+ public Provenance() {
+
+ }
+
+ public Provenance(String datasourceId, String datasourceName, String nsPrefix) {
+ this.datasourceId = datasourceId;
+ this.datasourceName = datasourceName;
+ this.nsPrefix = nsPrefix;
+ }
+
+ public String getDatasourceId() {
+ return datasourceId;
+ }
+
+ public void setDatasourceId(String datasourceId) {
+ this.datasourceId = datasourceId;
+ }
+
+ public String getDatasourceName() {
+ return datasourceName;
+ }
+
+ public void setDatasourceName(String datasourceName) {
+ this.datasourceName = datasourceName;
+ }
+
+ public String getNsPrefix() {
+ return nsPrefix;
+ }
+
+ public void setNsPrefix(String nsPrefix) {
+ this.nsPrefix = nsPrefix;
+ }
+}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
new file mode 100644
index 000000000..d37272d21
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
@@ -0,0 +1,25 @@
+package eu.dnetlib.dhp.utils;
+
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.lang3.StringUtils;
+
+public class DHPUtils {
+
+ public static String md5(final String s) {
+ try {
+ final MessageDigest md = MessageDigest.getInstance("MD5");
+ md.update(s.getBytes("UTF-8"));
+ return new String(Hex.encodeHex(md.digest()));
+ } catch (final Exception e) {
+ System.err.println("Error creating id");
+ return null;
+ }
+ }
+
+ public static String generateIdentifier(final String originalId, final String nsPrefix) {
+ return String.format("%s::%s",nsPrefix, DHPUtils.md5(originalId));
+ }
+
+}
diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java
new file mode 100644
index 000000000..4515429ea
--- /dev/null
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java
@@ -0,0 +1,15 @@
+package eu.dnetlib.dhp.model.mdstore;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertTrue;
+
+public class MetadataRecordTest {
+
+ /** The default constructor must stamp the record with the current collection time (epoch millis > 0). */
+ @Test
+ public void getTimestamp() {
+
+ MetadataRecord r = new MetadataRecord();
+ assertTrue(r.getDateOfCollection() >0);
+ }
+}
\ No newline at end of file
diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml
index 6ead99f42..5c5b804a7 100644
--- a/dhp-schemas/pom.xml
+++ b/dhp-schemas/pom.xml
@@ -8,6 +8,7 @@
eu.dnetlib.dhp
dhp
1.0.0-SNAPSHOT
+ ../
dhp-schemas
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java
index d49846b23..719e1f134 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java
@@ -50,10 +50,6 @@ public class GenerateNativeStoreSparkJob {
}
}
-
-
-
-
public static void main(String[] args) throws Exception {
Options options = new Options();
@@ -113,7 +109,7 @@ public class GenerateNativeStoreSparkJob {
final SparkSession spark = SparkSession
.builder()
.appName("GenerateNativeStoreSparkJob")
- .master("yarn")
+ .master("local")
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java
index 2e1d53aa4..926b13bba 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java
@@ -1,4 +1,51 @@
package eu.dnetlib.dhp.collection;
+import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.transformation.TransformFunction;
+import org.apache.commons.cli.*;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoder;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.util.LongAccumulator;
+
public class TransformSparkJobNode {
+
+ /**
+ * Reads a parquet mdstore of {@link MetadataRecord}s from --input, applies the
+ * {@link TransformFunction} and writes the result to --output as parquet.
+ *
+ * @param args -i/--input and -o/--output paths (both required)
+ * @throws ParseException if a required command line option is missing
+ */
+ public static void main(String[] args) throws ParseException {
+ Options options = new Options();
+
+ options.addOption(Option.builder("i")
+ .longOpt("input")
+ .required(true)
+ .desc("input path of the sequence file")
+ .hasArg() // This option has an argument.
+ .build());
+ options.addOption(Option.builder("o")
+ .longOpt("output")
+ .required(true)
+ .desc("output path of the mdstore")
+ .hasArg()
+ .build());
+
+ final CommandLineParser parser = new DefaultParser();
+ final CommandLine cmd = parser.parse(options, args);
+
+ final String inputPath = cmd.getOptionValue("i");
+ final String outputPath = cmd.getOptionValue("o");
+
+ final SparkSession spark = SparkSession
+ .builder()
+ .appName("TransformSparkJobNode") // was "GenerateNativeStoreSparkJob": copy-paste from the collection job
+ .master("local") // NOTE(review): hard-coded master, presumably for local testing — confirm before deploying
+ .getOrCreate();
+
+ // typed Encoder/Dataset instead of the raw types used originally
+ final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
+ final Dataset<MetadataRecord> mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder);
+ final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems");
+
+ final TransformFunction mfunc = new TransformFunction(totalItems);
+ mdstoreInput.map(mfunc, encoder).write().format("parquet").save(outputPath);
+ System.out.println("totalItems = " + totalItems.value());
+
+ }
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java
index 393db48ba..2380c673a 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java
@@ -1,13 +1,21 @@
-package eu.dnetlib.dhp.collection;
+package eu.dnetlib.dhp.transformation;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.util.LongAccumulator;
public class TransformFunction implements MapFunction {
+ // Spark accumulator counting how many records this function has processed
+ private final LongAccumulator totalItems;
+
+ public TransformFunction(LongAccumulator totalItems) {
+ this.totalItems= totalItems;
+ }
+
 @Override
 public MetadataRecord call(MetadataRecord value) throws Exception {
- return null;
+ // identity transformation for now: count the record and pass it through unchanged
+ totalItems.add(1);
+ return value;
 }
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/oozie/workflows/oozie_collection_workflows.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/oozie/workflows/oozie_collection_workflows.xml
index 0deda423b..c6279e7c5 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/oozie/workflows/oozie_collection_workflows.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/oozie/workflows/oozie_collection_workflows.xml
@@ -20,7 +20,7 @@
A json encoding of the Datasource Info
- identifierPath
+ identifierXPath
An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java
index 61bb472ce..3bed60586 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java
@@ -15,13 +15,23 @@ public class CollectionJobTest {
 @Test
 public void test () throws Exception {
 Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
-
-
-// GenerateNativeStoreSparkJob.main(new String[] {"XML", ""+System.currentTimeMillis(), new ObjectMapper().writeValueAsString(provenance), "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']","/home/sandro/Downloads/mdstore_oai","/home/sandro/Downloads/mdstore_result"});
+ // NOTE(review): hard-coded local paths (/home/sandro/...) make this test machine-specific — consider temp dirs
+ GenerateNativeStoreSparkJob.main(new String[] {"-e", "XML","-d", ""+System.currentTimeMillis(),"-p", new ObjectMapper().writeValueAsString(provenance), "-x","./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']","-i","/home/sandro/Downloads/oai_1","-o","/home/sandro/Downloads/mdstore_result"});
 System.out.println(new ObjectMapper().writeValueAsString(provenance));
 }
+ @Test
+ // NOTE(review): same machine-specific-path problem as the other tests in this
+ // class — runs the transform job end-to-end against a local mdstore and makes
+ // no assertions. "mdstore_cleande" also looks like a typo for
+ // "mdstore_cleaned" — confirm intent before reusing the path elsewhere.
+ public void transformTest () throws Exception {
+
+ TransformSparkJobNode.main(new String[]{"-o","/home/sandro/Downloads/mdstore_cleande","-i","/home/sandro/Downloads/mdstore_result"});
+
+
+
+
+ }
+
+
+
@Test
public void testGenerationMetadataRecord() throws Exception {
@@ -29,6 +39,7 @@ public class CollectionJobTest {
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
+ assert record != null;
System.out.println(record.getId());
System.out.println(record.getOriginalId());
@@ -42,9 +53,11 @@ public class CollectionJobTest {
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
MetadataRecord record1 = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
+ assert record != null;
record.setBody("ciao");
+ assert record1 != null;
record1.setBody("mondo");
- Assert.assertTrue(record.equals(record1));
+ Assert.assertEquals(record, record1);
}
diff --git a/dhp-workflows/dhp-collector-worker/target/maven-archiver/pom.properties b/dhp-workflows/dhp-collector-worker/target/maven-archiver/pom.properties
deleted file mode 100644
index ef502424b..000000000
--- a/dhp-workflows/dhp-collector-worker/target/maven-archiver/pom.properties
+++ /dev/null
@@ -1,4 +0,0 @@
-#Created by Apache Maven 3.6.0
-version=1.0.0
-groupId=eu.dnetlib
-artifactId=dhp-collector-worker
diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml
index f892307aa..f8fcde2a8 100644
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@@ -5,11 +5,11 @@
eu.dnetlib.dhp
dhp
1.0.0-SNAPSHOT
+ ../
dhp-workflows
pom
- dhp-collector-worker
dhp-aggregation
diff --git a/pom.xml b/pom.xml
index fde394c83..11544cc60 100644
--- a/pom.xml
+++ b/pom.xml
@@ -23,7 +23,7 @@
dhp-common
dhp-workflows
- dhp-applications
+ dhp-applications
@@ -37,9 +37,9 @@
- scm:git:ssh://git@github.com/???
- scm:git:ssh://git@github.com/???.git
- https://github.com/???
+ scm:git:ssh://git@github.com/dnet-team/dnet-hadoop.git
+ scm:git:ssh://git@github.com/dnet-team/dnet-hadoop.git
+ https://github.com/dnet-team/dnet-hadoop
HEAD
@@ -135,22 +135,16 @@
commons-cli
1.4
-
-
-
-
target
target/classes
${project.artifactId}-${project.version}
target/test-classes
-
-
org.apache.maven.plugins
maven-compiler-plugin
@@ -191,7 +185,6 @@
true
-
org.apache.maven.plugins
maven-javadoc-plugin
@@ -200,27 +193,21 @@
true
-
org.apache.maven.plugins
maven-dependency-plugin
3.0.0
-
-
-
-
org.apache.maven.plugins
maven-release-plugin
2.5.3
-