Merge branch 'stable_ids' into openorgswf

2021-03-29 16:39:07 +02:00 · 2021-03-29 16:39:07 +02:00 · 2709d08fc2
parent f446580e9f 3becaa5539
commit 2709d08fc2
167 changed files with 14014 additions and 2657 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@
 *.iws
 *~
 .vscode
+.metals
+.bloop
 .classpath
 /*/.classpath
 /*/*/.classpath
@@ -24,4 +26,5 @@
 spark-warehouse
 /**/job-override.properties
 /**/*.log
+/**/.factorypath

--- a/100.patch
+++ b/100.patch
@@ -0,0 +1,757 @@
+From c5fbad8093ca27deebf1b5fd5ffd39e1877c533d Mon Sep 17 00:00:00 2001
+From: antleb <antleb@di.uoa.gr>
+Date: Thu, 4 Mar 2021 00:42:21 +0200
+Subject: [PATCH 1/8] Contexts are now downloaded instead of using the
+ stats_ext db
+
+---
+ .../dhp/oa/graph/stats/oozie_app/contexts.sh  | 33 +++++++++++++++++++
+ .../graph/stats/oozie_app/scripts/step10.sql  | 13 --------
+ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 17 ++++++++++
+ 3 files changed, 50 insertions(+), 13 deletions(-)
+ create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+new file mode 100644
+index 00000000..f06a43bb
+--- /dev/null
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+CONTEXT_API=$1
+TARGET_DB=$2
+
+TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
+
+echo "Downloading context data"
+curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+cat contexts.csv  | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv
+cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv
+
+echo "uploading context data to hdfs"
+hdfs dfs -mkdir ${TMP}
+hdfs dfs -copyFromLocal contexts.csv ${TMP}
+hdfs dfs -copyFromLocal categories.csv ${TMP}
+hdfs dfs -copyFromLocal concepts.csv ${TMP}
+hdfs dfs -chmod -R 777 ${TMP}
+
+echo "Creating and populating impala tables"
+impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';"
+impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';"
+impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';"
+impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;"
+impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;"
+impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;"
+
+echo "Cleaning up"
+hdfs dfs -rm -f -r -skipTrash ${TMP}
+
+echo "Finito!"
+\ No newline at end of file
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
+index 6c96317e..77fbd3b1 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
+@@ -23,19 +23,6 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
+ SELECT *
+ FROM ${external_stats_db_name}.rndexpediture;
+ 
+-CREATE OR REPLACE VIEW ${stats_db_name}.context AS
+-SELECT *
+-FROM ${external_stats_db_name}.context;
+-
+-CREATE OR REPLACE VIEW ${stats_db_name}.category AS
+-SELECT *
+-FROM ${external_stats_db_name}.category;
+-
+-CREATE OR REPLACE VIEW ${stats_db_name}.concept AS
+-SELECT *
+-FROM ${external_stats_db_name}.concept;
+-
+-
+ ------------------------------------------------------------------------------------------------
+ ------------------------------------------------------------------------------------------------
+ -- Creation date of the database
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+index 9c16f149..afb10c41 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+@@ -41,6 +41,10 @@
+             <name>hive_timeout</name>
+             <description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
+         </property>
+        <property>
+            <name>context_api_url</name>
+            <description>the base url of the context api (https://services.openaire.eu/openaire)</description>
+        </property>
+     </parameters>
+ 
+     <global>
+@@ -263,6 +267,19 @@
+         <ok to="Step19"/>
+         <error to="Kill"/>
+     </action>
+
+    <action name="Step17">
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>contexts.sh</exec>
+            <argument>${context_api_url}</argument>
+            <argument>${stats_db_name}</argument>
+            <file>contexts.sh</file>
+        </shell>
+        <ok to="step20-createMonitorDB"/>
+        <error to="Kill"/>
+    </action>
+         
+     <action name="Step19">
+         <shell xmlns="uri:oozie:shell-action:0.1">
+-- 
+2.17.1
+
+
+From 6147ee495053634436abe822aaf9ba909813d8c4 Mon Sep 17 00:00:00 2001
+From: antleb <antleb@di.uoa.gr>
+Date: Fri, 5 Mar 2021 14:12:18 +0200
+Subject: [PATCH 2/8] assigning correctly hive contexts to concepts
+
+---
+ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh    | 7 +++++--
+ .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql | 5 ++++-
+ .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql | 5 ++++-
+ .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql | 5 ++++-
+ .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql | 5 ++++-
+ 5 files changed, 21 insertions(+), 6 deletions(-)
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+index f06a43bb..6788f88b 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+@@ -9,8 +9,8 @@ echo "Downloading context data"
+ curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+ cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+ cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+-cat contexts.csv  | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv
+-cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv
+cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
+cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
+ 
+ echo "uploading context data to hdfs"
+ hdfs dfs -mkdir ${TMP}
+@@ -29,5 +29,8 @@ impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.
+ 
+ echo "Cleaning up"
+ hdfs dfs -rm -f -r -skipTrash ${TMP}
+rm concepts.csv
+rm categories.csv
+rm contexts.csv
+ 
+ echo "Finito!"
+\ No newline at end of file
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+index 62a15856..75b24b18 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
+@@ -47,7 +47,10 @@ from ${openaire_db_name}.publication p
+ where p.datainfo.deletedbyinference = false;
+ 
+ CREATE TABLE ${stats_db_name}.publication_concepts AS
+-SELECT substr(p.id, 4) as id, contexts.context.id as concept
+SELECT substr(p.id, 4) as id, case
+    when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+    when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+    when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
+ from ${openaire_db_name}.publication p
+          LATERAL VIEW explode(p.context) contexts as context
+ where p.datainfo.deletedbyinference = false;
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
+index dcd5ad85..540cc03a 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
+@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.dataset p
+ where p.datainfo.deletedbyinference = false;
+ 
+ CREATE TABLE ${stats_db_name}.dataset_concepts AS
+-SELECT substr(p.id, 4) as id, contexts.context.id as concept
+SELECT substr(p.id, 4) as id, case
+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+                                  when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
+ from ${openaire_db_name}.dataset p
+          LATERAL VIEW explode(p.context) contexts as context
+ where p.datainfo.deletedbyinference = false;
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+index fd5390e6..54345e07 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.software p
+ where p.datainfo.deletedbyinference = false;
+ 
+ CREATE TABLE ${stats_db_name}.software_concepts AS
+-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
+SELECT substr(p.id, 4) as id, case
+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+                                  when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
+ FROM ${openaire_db_name}.software p
+          LATERAL VIEW explode(p.context) contexts AS context
+ where p.datainfo.deletedbyinference = false;
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+index b359b596..36ad5d92 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+@@ -52,7 +52,10 @@ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.
+ where p.datainfo.deletedbyinference = false;
+ 
+ CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
+-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
+SELECT substr(p.id, 4) as id, case
+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+                                  when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
+ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
+ where p.datainfo.deletedbyinference = false;
+ 
+-- 
+2.17.1
+
+
+From f40c150a0d549e2dbcfd42ecf81e17ad4b505391 Mon Sep 17 00:00:00 2001
+From: antleb <antleb@di.uoa.gr>
+Date: Sat, 6 Mar 2021 00:35:57 +0200
+Subject: [PATCH 3/8] fixed steps...
+
+---
+ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml      | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+index afb10c41..2184cb8a 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+@@ -264,7 +264,7 @@
+             <param>stats_db_name=${stats_db_name}</param>
+             <param>openaire_db_name=${openaire_db_name}</param>
+         </hive2>
+-        <ok to="Step19"/>
+        <ok to="Step17"/>
+         <error to="Kill"/>
+     </action>
+ 
+@@ -277,7 +277,7 @@
+             <argument>${stats_db_name}</argument>
+             <file>contexts.sh</file>
+         </shell>
+-        <ok to="step20-createMonitorDB"/>
+        <ok to="step19"/>
+         <error to="Kill"/>
+     </action>
+         
+-- 
+2.17.1
+
+
+From fa1ec5b5e9b6038b3b565422af5c6406f21220d3 Mon Sep 17 00:00:00 2001
+From: antleb <antleb@di.uoa.gr>
+Date: Wed, 10 Mar 2021 14:05:58 +0200
+Subject: [PATCH 4/8] fixed typo...
+
+---
+ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml        | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+index 2184cb8a..321500e2 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+@@ -277,7 +277,7 @@
+             <argument>${stats_db_name}</argument>
+             <file>contexts.sh</file>
+         </shell>
+-        <ok to="step19"/>
+        <ok to="Step19"/>
+         <error to="Kill"/>
+     </action>
+         
+-- 
+2.17.1
+
+
+From 3c75a050443942b632cf8469b5af16a8c61e7569 Mon Sep 17 00:00:00 2001
+From: antleb <antleb@di.uoa.gr>
+Date: Fri, 12 Mar 2021 13:47:04 +0200
+Subject: [PATCH 5/8] fixed a ton of typos
+
+---
+ .../scripts/computeProductionStats.sql        |  8 -------
+ .../stats/oozie_app/updateProductionViews.sh  | 18 ++++++++++++++++
+ .../dhp/oa/graph/stats/oozie_app/contexts.sh  | 21 ++++++++++++-------
+ 3 files changed, 32 insertions(+), 15 deletions(-)
+ delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
+ create mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
+deleted file mode 100644
+index 34e48a18..00000000
+--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
+++ /dev/null
+@@ -1,8 +0,0 @@
+-------------------------------------------------------
+-------------------------------------------------------
+--- Impala table statistics - Needed to make the tables
+--- visible for impala
+-------------------------------------------------------
+-------------------------------------------------------
+-
+-INVALIDATE METADATA ${stats_db_name};
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+new file mode 100644
+index 00000000..57acb2ee
+--- /dev/null
+++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+@@ -0,0 +1,18 @@
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+export SOURCE=$1
+export SHADOW=$2
+
+echo "Updating shadow database"
+impala-shell -d ${SOURCE} -q "invalidate metadata"
+impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f -
+impala-shell -q "create database if not exists ${SHADOW}"
+impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
+impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
+echo "Shadow db ready!"
+\ No newline at end of file
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+index 6788f88b..c28be50d 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+@@ -1,4 +1,10 @@
+-#!/usr/bin/env bash
+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+if ! [ -L $link_folder ]
+then
+    rm -Rf "$link_folder"
+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+ 
+ CONTEXT_API=$1
+ TARGET_DB=$2
+@@ -20,12 +26,13 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP}
+ hdfs dfs -chmod -R 777 ${TMP}
+ 
+ echo "Creating and populating impala tables"
+-impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';"
+-impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';"
+-impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';"
+-impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;"
+-impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;"
+-impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;"
+impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
+impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
+impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
+impala-shell -d ${TARGET_DB} -q "invalidate metadata"
+impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
+impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
+impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
+ 
+ echo "Cleaning up"
+ hdfs dfs -rm -f -r -skipTrash ${TMP}
+-- 
+2.17.1
+
+
+From 236435b47010ea1ab94c3f018dcf278f5d2c44aa Mon Sep 17 00:00:00 2001
+From: antleb <antleb@di.uoa.gr>
+Date: Fri, 12 Mar 2021 14:11:21 +0200
+Subject: [PATCH 6/8] following redirects
+
+---
+ .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh     | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+index c28be50d..29b225e3 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+@@ -12,9 +12,9 @@ TARGET_DB=$2
+ TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
+ 
+ echo "Downloading context data"
+-curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+-cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+-cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+curl -L ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+ cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
+ cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
+ 
+-- 
+2.17.1
+
+
+From 60ebdf2dbe704733809f401df70bffcf49cede29 Mon Sep 17 00:00:00 2001
+From: antleb <antleb@di.uoa.gr>
+Date: Fri, 12 Mar 2021 16:34:53 +0200
+Subject: [PATCH 7/8] update promote wf to support monitor&production
+
+---
+ .../oa/graph/stats/oozie_app/impala-shell.sh  |  18 --
+ .../scripts/updateProductionViews.sql         | 207 ------------------
+ 2 files changed, 225 deletions(-)
+ delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
+ delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
+
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
+deleted file mode 100644
+index 70112dc7..00000000
+--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
+++ /dev/null
+@@ -1,18 +0,0 @@
+-export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+-export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+-if ! [ -L $link_folder ]
+-then
+-    rm -Rf "$link_folder"
+-    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+-fi
+-
+-echo "Getting file from " $3
+-hdfs dfs -copyToLocal $3
+-
+-echo "Running impala shell make the new database visible"
+-impala-shell -q "INVALIDATE METADATA;"
+-
+-echo "Running impala shell to compute new table stats"
+-impala-shell -d $1 -f $2
+-echo "Impala shell finished"
+-rm $2
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
+deleted file mode 100644
+index 48f8d58f..00000000
+--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
+++ /dev/null
+@@ -1,207 +0,0 @@
+-------------------------------------------------------
+-------------------------------------------------------
+--- Shadow schema table exchange
+-------------------------------------------------------
+-------------------------------------------------------
+-
+--- Dropping old views
+-DROP VIEW IF EXISTS ${stats_db_production_name}.category;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.concept;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.context;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.country;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.countrygdp;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.creation_date;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_citations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_classifications;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_concepts;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_datasources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_languages;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_licenses;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_oids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_pids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_refereed;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_topics;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_languages;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_oids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_organizations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_results;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.funder;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.fundref;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.numbers_country;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_datasources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_pids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_projects;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_citations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_classifications;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_concepts;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_datasources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_languages;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_licenses;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_oids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_pids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_refereed;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_topics;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.project;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.project_oids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.project_organizations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.project_resultcount;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results_publication;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_citations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_classifications;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_concepts;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_datasources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_languages;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_licenses;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_oids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_pids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_refereed;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_topics;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_affiliated_country;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_citations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_classifications;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_concepts;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_datasources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_deposited_country;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_fundercount;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_gold;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_greenoa;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_languages;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_licenses;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_oids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_organization;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_peerreviewed;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_pids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projectcount;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projects;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_refereed;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.result_topics;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.rndexpediture;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.roarmap;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_citations;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_classifications;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_concepts;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_datasources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_languages;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_licenses;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_oids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_pids;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_refereed;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_sources;
+-DROP VIEW IF EXISTS ${stats_db_production_name}.software_topics;
+-
+-
+--- Creating the shadow database, in case it doesn't exist
+-CREATE database IF NOT EXISTS ${stats_db_production_name};
+-
+--- Creating new views
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.category AS SELECT * FROM ${stats_db_name}.category;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.concept AS SELECT * FROM ${stats_db_name}.concept;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.context AS SELECT * FROM ${stats_db_name}.context;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.country AS SELECT * FROM ${stats_db_name}.country;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.funder AS SELECT * FROM ${stats_db_name}.funder;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization AS SELECT * FROM ${stats_db_name}.organization;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project AS SELECT * FROM ${stats_db_name}.project;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication AS SELECT * FROM ${stats_db_name}.publication;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result AS SELECT * FROM ${stats_db_name}.result;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software AS SELECT * FROM ${stats_db_name}.software;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources;
+-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics;
+-- 
+2.17.1
+
+
+From 0ba0a6b9dac25f5ec73e8eafefbf7f91442ad1c5 Mon Sep 17 00:00:00 2001
+From: antleb <antleb@di.uoa.gr>
+Date: Fri, 12 Mar 2021 16:42:59 +0200
+Subject: [PATCH 8/8] update promote wf to support monitor&production
+
+---
+ .../stats/oozie_app/updateProductionViews.sh  | 14 +++----
+ .../dhp/oa/graph/stats/oozie_app/workflow.xml | 37 ++++++++++++-------
+ 2 files changed, 29 insertions(+), 22 deletions(-)
+
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+index 57acb2ee..3e510e87 100644
+--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
+@@ -7,12 +7,10 @@ then
+ fi
+ 
+ export SOURCE=$1
+-export SHADOW=$2
+export PRODUCTION=$2
+ 
+-echo "Updating shadow database"
+-impala-shell -d ${SOURCE} -q "invalidate metadata"
+-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f -
+-impala-shell -q "create database if not exists ${SHADOW}"
+-impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
+-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
+-echo "Shadow db ready!"
+\ No newline at end of file
+# Rebuild the ${PRODUCTION} database as a set of views over the tables listed in ${SOURCE}.
+echo "Updating ${PRODUCTION} database"
+impala-shell -q "create database if not exists ${PRODUCTION}"
+# Turn every name listed by "show tables" in ${PRODUCTION} into a "drop view if exists" statement.
+# Database names are quoted so that an empty or whitespace-containing argument cannot shift the impala-shell options.
+impala-shell -d "${PRODUCTION}" -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
+# Create one "select *" view in ${PRODUCTION} for every name listed by "show tables" in ${SOURCE}.
+impala-shell -d "${SOURCE}" -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
+echo "Production db ready!"
+\ No newline at end of file
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+index d744f18d..0d8ff7ee 100644
+--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+@@ -6,7 +6,15 @@
+         </property>
+         <property>
+             <name>stats_db_production_name</name>
+-            <description>the name of the production schema</description>
+            <description>the name of the public production schema</description>
+        </property>
+        <property>
+            <name>monitor_db_name</name>
+            <description>the monitor database name</description>
+        </property>
+        <property>
+            <name>monitor_db_production_name</name>
+            <description>the name of the monitor public database</description>
+         </property>
+         <property>
+             <name>stats_tool_api_url</name>
+@@ -48,25 +56,26 @@
+     </kill>
+ 
+     <action name="updateProductionViews">
+-        <hive2 xmlns="uri:oozie:hive2-action:0.1">
+-            <jdbc-url>${hive_jdbc_url}</jdbc-url>
+-            <script>scripts/updateProductionViews.sql</script>
+-			<param>stats_db_name=${stats_db_name}</param>
+-			<param>stats_db_production_name=${stats_db_production_name}</param>
+-        </hive2>
+-        <ok to="computeProductionStats"/>
+        <shell xmlns="uri:oozie:shell-action:0.1">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>updateProductionViews.sh</exec>
+            <argument>${stats_db_name}</argument>
+            <argument>${stats_db_production_name}</argument>
+            <file>updateProductionViews.sh</file>
+        </shell>
+        <ok to="updateMonitorViews"/>
+         <error to="Kill"/>
+     </action>
+ 
+-    <action name="computeProductionStats">
+    <action name="updateMonitorViews">
+         <shell xmlns="uri:oozie:shell-action:0.1">
+             <job-tracker>${jobTracker}</job-tracker>
+             <name-node>${nameNode}</name-node>
+-            <exec>impala-shell.sh</exec>
+-            <argument>${stats_db_production_name}</argument>
+-            <argument>computeProductionStats.sql</argument>
+-            <argument>${wf:appPath()}/scripts/computeProductionStats.sql</argument>
+-            <file>impala-shell.sh</file>
+            <exec>updateProductionViews.sh</exec>
+            <argument>${monitor_db_name}</argument>
+            <argument>${monitor_db_production_name}</argument>
+            <file>updateProductionViews.sh</file>
+         </shell>
+         <ok to="promoteCache"/>
+         <error to="Kill"/>
+-- 
+2.17.1
+
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -6,7 +6,8 @@
 		<groupId>eu.dnetlib.dhp</groupId>
 		<artifactId>dhp</artifactId>
 		<version>1.2.4-SNAPSHOT</version>
-		<relativePath>../</relativePath>
+		<relativePath>../pom.xml</relativePath>
+
 	</parent>

 	<artifactId>dhp-common</artifactId>
@ -29,12 +30,6 @@
 			<artifactId>spark-sql_2.11</artifactId>
 		</dependency>

-		<dependency>
-			<groupId>eu.dnetlib.dhp</groupId>
-			<artifactId>dhp-schemas</artifactId>
-			<version>${project.version}</version>
-		</dependency>
-
 		<dependency>
 			<groupId>commons-cli</groupId>
 			<artifactId>commons-cli</artifactId>
@ -59,11 +54,6 @@
 			<groupId>com.fasterxml.jackson.core</groupId>
 			<artifactId>jackson-databind</artifactId>
 		</dependency>
-		<!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
-		<dependency>
-			<groupId>com.rabbitmq</groupId>
-			<artifactId>amqp-client</artifactId>
-		</dependency>
 		<dependency>
 			<groupId>net.sf.saxon</groupId>
 			<artifactId>Saxon-HE</artifactId>
@ -104,6 +94,21 @@
 			<artifactId>dnet-pace-core</artifactId>
 		</dependency>

+		<dependency>
+			<groupId>org.apache.httpcomponents</groupId>
+			<artifactId>httpclient</artifactId>
+		</dependency>
+
+		<dependency>
+			<groupId>org.mongodb</groupId>
+			<artifactId>mongo-java-driver</artifactId>
+		</dependency>
+
+		<dependency>
+			<groupId>eu.dnetlib.dhp</groupId>
+			<artifactId>dhp-schemas</artifactId>
+			<version>${project.version}</version>
+		</dependency>
 	</dependencies>

 </project>
--- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java
+++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java
@ -2,12 +2,16 @@
 package eu.dnetlib.data.mdstore.manager.common.model;

 import java.io.Serializable;
+import java.util.Date;
+import java.util.Objects;
 import java.util.UUID;

 import javax.persistence.Column;
 import javax.persistence.Entity;
 import javax.persistence.Id;
 import javax.persistence.Table;
+import javax.persistence.Temporal;
+import javax.persistence.TemporalType;

@Entity
@Table(name = "mdstores")
@ -38,6 +42,13 @@ public class MDStore implements Serializable {
 	@Column(name = "api_id")
 	private String apiId;

+	@Column(name = "hdfs_path")
+	private String hdfsPath;
+
+	@Column(name = "creation_date")
+	@Temporal(TemporalType.TIMESTAMP)
+	private Date creationDate;
+
 	public String getId() {
 		return id;
 	}
@ -94,9 +105,28 @@ public class MDStore implements Serializable {
 		this.apiId = apiId;
 	}

+	/**
+	 * @return the value of the {@code hdfsPath} field (mapped to the {@code hdfs_path} column)
+	 */
+	public String getHdfsPath() {
+		return this.hdfsPath;
+	}
+
+	/**
+	 * Stores the given value in the {@code hdfsPath} field (mapped to the {@code hdfs_path} column).
+	 *
+	 * @param hdfsPath the path to record; it is stored as-is, without validation
+	 */
+	public void setHdfsPath(final String hdfsPath) {
+		this.hdfsPath = hdfsPath;
+	}
+
+	/**
+	 * @return the {@code creationDate} field (mapped to the {@code creation_date} column); the stored
+	 *         {@link Date} instance itself is returned, not a copy
+	 */
+	public Date getCreationDate() {
+		return this.creationDate;
+	}
+
+	/**
+	 * Stores the given value in the {@code creationDate} field (mapped to the {@code creation_date} column).
+	 *
+	 * @param creationDate the date to record; the reference is kept as-is, no copy is made
+	 */
+	public void setCreationDate(final Date creationDate) {
+		this.creationDate = creationDate;
+	}
+
 	public static MDStore newInstance(
-		final String format, final String layout, final String interpretation) {
-		return newInstance(format, layout, interpretation, null, null, null);
+		final String format,
+		final String layout,
+		final String interpretation,
+		final String hdfsBasePath) {
+		return newInstance(format, layout, interpretation, null, null, null, hdfsBasePath);
 	}

 	public static MDStore newInstance(
@ -105,15 +135,48 @@ public class MDStore implements Serializable {
 		final String interpretation,
 		final String dsName,
 		final String dsId,
-		final String apiId) {
+		final String apiId,
+		final String hdfsBasePath) {
+
+		final String mdId = "md-" + UUID.randomUUID();
+
 		final MDStore md = new MDStore();
-		md.setId("md-" + UUID.randomUUID());
+		md.setId(mdId);
 		md.setFormat(format);
 		md.setLayout(layout);
 		md.setInterpretation(interpretation);
+		md.setCreationDate(new Date());
 		md.setDatasourceName(dsName);
 		md.setDatasourceId(dsId);
 		md.setApiId(apiId);
+		md.setHdfsPath(String.format("%s/%s", hdfsBasePath, mdId));
+
 		return md;
 	}
+
+	/**
+	 * Single-line description listing the id, format, layout, interpretation, datasource name and id,
+	 * api id, HDFS path and creation date of this mdstore.
+	 */
+	@Override
+	public String toString() {
+		final String template = "MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]";
+		return String
+			.format(
+				template, id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath,
+				creationDate);
+	}
+
+	/**
+	 * Hash code computed from the {@code id} field only.
+	 */
+	@Override
+	public int hashCode() {
+		return Objects.hash(this.id);
+	}
+
+	/**
+	 * Two mdstores are equal when the other object is an {@code MDStore} carrying an equal {@code id}
+	 * (two {@code null} ids compare as equal).
+	 */
+	@Override
+	public boolean equals(final Object obj) {
+		if (obj == this) {
+			return true;
+		}
+		if (obj instanceof MDStore) {
+			final MDStore that = (MDStore) obj;
+			return Objects.equals(id, that.id);
+		}
+		return false;
+	}
+
 }
--- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java
+++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java
@ -2,6 +2,7 @@
 package eu.dnetlib.data.mdstore.manager.common.model;

 import java.io.Serializable;
+import java.util.Objects;

 import javax.persistence.Column;
 import javax.persistence.Entity;
@ -48,4 +49,26 @@ public class MDStoreCurrentVersion implements Serializable {
 	public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) {
 		return newInstance(v.getMdstore(), v.getId());
 	}
+
+	/**
+	 * Single-line description listing the {@code mdstore} and {@code currentVersion} fields.
+	 */
+	@Override
+	public String toString() {
+		final String template = "MDStoreCurrentVersion [mdstore=%s, currentVersion=%s]";
+		return String.format(template, mdstore, currentVersion);
+	}
+
+	/**
+	 * Hash code computed from the {@code currentVersion} and {@code mdstore} fields, in that order.
+	 */
+	@Override
+	public int hashCode() {
+		return Objects.hash(this.currentVersion, this.mdstore);
+	}
+
+	/**
+	 * Two instances are equal when the other object is an {@code MDStoreCurrentVersion} whose
+	 * {@code currentVersion} and {@code mdstore} fields are both equal to the ones of this instance.
+	 */
+	@Override
+	public boolean equals(final Object obj) {
+		if (obj == this) {
+			return true;
+		}
+		if (obj instanceof MDStoreCurrentVersion) {
+			final MDStoreCurrentVersion that = (MDStoreCurrentVersion) obj;
+			return Objects.equals(currentVersion, that.currentVersion) && Objects.equals(mdstore, that.mdstore);
+		}
+		return false;
+	}
 }
--- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java
+++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java
@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model;

 import java.io.Serializable;
 import java.util.Date;
+import java.util.Objects;

 import javax.persistence.Column;
 import javax.persistence.Entity;
@ -38,15 +39,22 @@ public class MDStoreVersion implements Serializable {
 	@Column(name = "size")
 	private long size = 0;

-	public static MDStoreVersion newInstance(final String mdId, final boolean writing) {
-		final MDStoreVersion t = new MDStoreVersion();
-		t.setId(mdId + "-" + new Date().getTime());
-		t.setMdstore(mdId);
-		t.setLastUpdate(null);
-		t.setWriting(writing);
-		t.setReadCount(0);
-		t.setSize(0);
-		return t;
+	@Column(name = "hdfs_path")
+	private String hdfsPath;
+
+	/**
+	 * Builds a new version record for the given mdstore.
+	 * <p>
+	 * The version id is {@code <mdId>-<current time in milliseconds>} and the HDFS path is
+	 * {@code <hdfsBasePath>/<mdId>/<versionId>}. The new version starts with no last-update date,
+	 * a read count of 0 and a size of 0.
+	 *
+	 * @param mdId         identifier of the mdstore the version belongs to
+	 * @param writing      value stored in the {@code writing} flag of the new version
+	 * @param hdfsBasePath prefix of the generated HDFS path; it is formatted as-is, without validation
+	 * @return the newly populated version
+	 */
+	public static MDStoreVersion newInstance(final String mdId, final boolean writing, final String hdfsBasePath) {
+		final MDStoreVersion version = new MDStoreVersion();
+
+		final String versionId = mdId + "-" + new Date().getTime();
+		final String versionPath = String.format("%s/%s/%s", hdfsBasePath, mdId, versionId);
+
+		version.setId(versionId);
+		version.setMdstore(mdId);
+		version.setLastUpdate(null);
+		version.setWriting(writing);
+		version.setReadCount(0);
+		version.setSize(0);
+		version.setHdfsPath(versionPath);
+
+		return version;
 	}

 	public String getId() {
@ -96,4 +104,37 @@ public class MDStoreVersion implements Serializable {
 	public void setSize(final long size) {
 		this.size = size;
 	}
+
+	/**
+	 * @return the value of the {@code hdfsPath} field (mapped to the {@code hdfs_path} column)
+	 */
+	public String getHdfsPath() {
+		return this.hdfsPath;
+	}
+
+	/**
+	 * Stores the given value in the {@code hdfsPath} field (mapped to the {@code hdfs_path} column).
+	 *
+	 * @param hdfsPath the path to record; it is stored as-is, without validation
+	 */
+	public void setHdfsPath(final String hdfsPath) {
+		this.hdfsPath = hdfsPath;
+	}
+
+	/**
+	 * Single-line description listing the id, owning mdstore, writing flag, read count, last update,
+	 * size and HDFS path of this version.
+	 */
+	@Override
+	public String toString() {
+		final String template = "MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]";
+		return String.format(template, id, mdstore, writing, readCount, lastUpdate, size, hdfsPath);
+	}
+
+	/**
+	 * Hash code computed from the {@code id} field only.
+	 */
+	@Override
+	public int hashCode() {
+		return Objects.hash(this.id);
+	}
+
+	/**
+	 * Two versions are equal when the other object is an {@code MDStoreVersion} carrying an equal
+	 * {@code id} (two {@code null} ids compare as equal).
+	 */
+	@Override
+	public boolean equals(final Object obj) {
+		if (obj == this) {
+			return true;
+		}
+		if (obj instanceof MDStoreVersion) {
+			final MDStoreVersion that = (MDStoreVersion) obj;
+			return Objects.equals(id, that.id);
+		}
+		return false;
+	}
 }
--- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java
+++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java
@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model;

 import java.io.Serializable;
 import java.util.Date;
+import java.util.Objects;

 import javax.persistence.Column;
 import javax.persistence.Entity;
@ -43,6 +44,10 @@ public class MDStoreWithInfo implements Serializable {
 	@Column(name = "current_version")
 	private String currentVersion;

+	@Column(name = "creation_date")
+	@Temporal(TemporalType.TIMESTAMP)
+	private Date creationDate;
+
 	@Column(name = "lastupdate")
 	@Temporal(TemporalType.TIMESTAMP)
 	private Date lastUpdate;
@ -53,6 +58,9 @@ public class MDStoreWithInfo implements Serializable {
 	@Column(name = "n_versions")
 	private long numberOfVersions = 0;

+	@Column(name = "hdfs_path")
+	private String hdfsPath;
+
 	public String getId() {
 		return id;
 	}
@ -117,6 +125,14 @@ public class MDStoreWithInfo implements Serializable {
 		this.currentVersion = currentVersion;
 	}

+	/**
+	 * @return the {@code creationDate} field (mapped to the {@code creation_date} column); the stored
+	 *         {@link Date} instance itself is returned, not a copy
+	 */
+	public Date getCreationDate() {
+		return this.creationDate;
+	}
+
+	/**
+	 * Stores the given value in the {@code creationDate} field (mapped to the {@code creation_date} column).
+	 *
+	 * @param creationDate the date to record; the reference is kept as-is, no copy is made
+	 */
+	public void setCreationDate(final Date creationDate) {
+		this.creationDate = creationDate;
+	}
+
 	public Date getLastUpdate() {
 		return lastUpdate;
 	}
@ -140,4 +156,39 @@ public class MDStoreWithInfo implements Serializable {
 	public void setNumberOfVersions(final long numberOfVersions) {
 		this.numberOfVersions = numberOfVersions;
 	}
+
+	/**
+	 * @return the value of the {@code hdfsPath} field (mapped to the {@code hdfs_path} column)
+	 */
+	public String getHdfsPath() {
+		return this.hdfsPath;
+	}
+
+	/**
+	 * Stores the given value in the {@code hdfsPath} field (mapped to the {@code hdfs_path} column).
+	 *
+	 * @param hdfsPath the path to record; it is stored as-is, without validation
+	 */
+	public void setHdfsPath(final String hdfsPath) {
+		this.hdfsPath = hdfsPath;
+	}
+
+	/**
+	 * Single-line description listing the mdstore attributes together with the current version,
+	 * creation date, last update, size, number of versions and HDFS path.
+	 */
+	@Override
+	public String toString() {
+		final String template = "MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]";
+		return String
+			.format(
+				template, id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion,
+				creationDate, lastUpdate, size, numberOfVersions, hdfsPath);
+	}
+
+	/**
+	 * Hash code computed from the {@code id} field only.
+	 */
+	@Override
+	public int hashCode() {
+		return Objects.hash(this.id);
+	}
+
+	/**
+	 * Two instances are equal when the other object is an {@code MDStoreWithInfo} carrying an equal
+	 * {@code id} (two {@code null} ids compare as equal).
+	 */
+	@Override
+	public boolean equals(final Object obj) {
+		if (obj == this) {
+			return true;
+		}
+		if (obj instanceof MDStoreWithInfo) {
+			final MDStoreWithInfo that = (MDStoreWithInfo) obj;
+			return Objects.equals(id, that.id);
+		}
+		return false;
+	}
+
 }
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java
@ -0,0 +1,14 @@
+
+package eu.dnetlib.dhp.application;
+
+import java.io.*;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.collect.Maps;
+
+/**
+ * Placeholder class: it currently declares no fields and no methods.
+ * <p>
+ * NOTE(review): the class body references none of the imports declared at the top of this file
+ * ({@code java.io.*}, {@code Map}, {@code Properties}, {@code Configuration}, {@code Maps}) — confirm
+ * whether they are still needed.
+ */
+public class ApplicationUtils {
+
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java
@ -1,10 +1,7 @@

 package eu.dnetlib.dhp.application;

-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.Serializable;
-import java.io.StringWriter;
+import java.io.*;
 import java.util.*;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
@ -12,17 +9,21 @@ import java.util.zip.GZIPOutputStream;
 import org.apache.commons.cli.*;
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.databind.ObjectMapper;

 public class ArgumentApplicationParser implements Serializable {

+	private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);
+
 	private final Options options = new Options();
 	private final Map<String, String> objectMap = new HashMap<>();

 	private final List<String> compressedValues = new ArrayList<>();

-	public ArgumentApplicationParser(final String json_configuration) throws Exception {
+	public ArgumentApplicationParser(final String json_configuration) throws IOException {
 		final ObjectMapper mapper = new ObjectMapper();
 		final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
 		createOptionMap(configuration);
@ -33,7 +34,6 @@ public class ArgumentApplicationParser implements Serializable {
 	}

 	private void createOptionMap(final OptionsParameter[] configuration) {
-
 		Arrays
 			.stream(configuration)
 			.map(
@ -47,10 +47,6 @@ public class ArgumentApplicationParser implements Serializable {
 					return o;
 				})
 			.forEach(options::addOption);
-
-		// HelpFormatter formatter = new HelpFormatter();
-		// formatter.printHelp("myapp", null, options, null, true);
-
 	}

 	public static String decompressValue(final String abstractCompressed) {
@ -61,7 +57,7 @@ public class ArgumentApplicationParser implements Serializable {
 			IOUtils.copy(gis, stringWriter);
 			return stringWriter.toString();
 		} catch (Throwable e) {
-			System.out.println("Wrong value to decompress:" + abstractCompressed);
+			log.error("Wrong value to decompress:" + abstractCompressed);
 			throw new RuntimeException(e);
 		}
 	}
@ -74,7 +70,7 @@ public class ArgumentApplicationParser implements Serializable {
 		return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
 	}

-	public void parseArgument(final String[] args) throws Exception {
+	public void parseArgument(final String[] args) throws ParseException {
 		CommandLineParser parser = new BasicParser();
 		CommandLine cmd = parser.parse(options, args);
 		Arrays
--- a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java
+++ b/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java
@ -1,5 +1,5 @@

-package eu.dnetlib.collector.worker.model;
+package eu.dnetlib.dhp.collection;

 import java.util.HashMap;
 import java.util.Map;
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java
@ -27,4 +27,24 @@ public class Constants {
 		coarCodeLabelMap.put("c_f1cf", "EMBARGO");
 	}

+	public static final String SEQUENCE_FILE_NAME = "/sequence_file";
+	public static final String REPORT_FILE_NAME = "/report";
+	public static final String MDSTORE_DATA_PATH = "/store";
+	public static final String MDSTORE_SIZE_PATH = "/size";
+
+	public static final String COLLECTION_MODE = "collectionMode";
+	public static final String METADATA_ENCODING = "metadataEncoding";
+	public static final String OOZIE_WF_PATH = "oozieWfPath";
+	public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL";
+
+	public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry";
+	public static final String REQUEST_DELAY = "requestDelay";
+	public static final String RETRY_DELAY = "retryDelay";
+	public static final String CONNECT_TIMEOUT = "connectTimeOut";
+	public static final String READ_TIMEOUT = "readTimeOut";
+
+	public static final String CONTENT_TOTALITEMS = "TotalItems";
+	public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
+	public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
+
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java
@ -1,39 +1,60 @@

-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common;

 import java.io.Closeable;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Optional;
 import java.util.stream.StreamSupport;

 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.bson.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import com.google.common.collect.Iterables;
+import com.mongodb.BasicDBObject;
 import com.mongodb.MongoClient;
 import com.mongodb.MongoClientURI;
+import com.mongodb.QueryBuilder;
 import com.mongodb.client.MongoCollection;
 import com.mongodb.client.MongoDatabase;

 public class MdstoreClient implements Closeable {

+	private static final Logger log = LoggerFactory.getLogger(MdstoreClient.class);
+
 	private final MongoClient client;
 	private final MongoDatabase db;

 	private static final String COLL_METADATA = "metadata";
 	private static final String COLL_METADATA_MANAGER = "metadataManager";

-	private static final Log log = LogFactory.getLog(MdstoreClient.class);
-
 	public MdstoreClient(final String baseUrl, final String dbName) {
 		this.client = new MongoClient(new MongoClientURI(baseUrl));
 		this.db = getDb(client, dbName);
 	}

+	/**
+	 * Resolves the MongoDB collection currently associated with the given mdstore.
+	 *
+	 * The metadata manager collection is queried for the entry whose "mdId" equals the given id; the value of its
+	 * "currentId" field is then used as the name of the collection to return.
+	 *
+	 * @param mdId the mdstore identifier to look up
+	 * @return the collection named after the resolved "currentId"
+	 * @throws IllegalArgumentException when no entry, or no "currentId" value, is found for mdId
+	 */
+	public MongoCollection<Document> mdStore(final String mdId) {
+		final BasicDBObject mdIdFilter = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
+		log.info("querying current mdId: {}", mdIdFilter.toJson());
+
+		final MongoCollection<Document> managerColl = getColl(db, COLL_METADATA_MANAGER, true);
+		final Optional<String> resolved = Optional
+			.ofNullable(managerColl.find(mdIdFilter))
+			.map(matches -> matches.first())
+			.map(entry -> entry.getString("currentId"));
+		if (!resolved.isPresent()) {
+			throw new IllegalArgumentException("cannot find current mdstore id for: " + mdId);
+		}
+
+		final String currentId = resolved.get();
+		log.info("currentId: {}", currentId);
+		return getColl(db, currentId, true);
+	}
+
 	public Map<String, String> validCollections(
 		final String mdFormat, final String mdLayout, final String mdInterpretation) {

--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java
@ -0,0 +1,72 @@
+
+package eu.dnetlib.dhp.common.rest;
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.client.methods.HttpUriRequest;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * Static helper methods performing HTTP GET and POST requests, with optional JSON (de)serialization of the
+ * request payload and of the response body through Jackson.
+ */
+public class DNetRestClient {
+
+	private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
+
+	private static final ObjectMapper mapper = new ObjectMapper();
+
+	/**
+	 * Performs a GET request and deserializes the response body.
+	 *
+	 * @param url the URL to request
+	 * @param clazz the type the response body is mapped to
+	 * @return the response body deserialized as clazz
+	 * @throws Exception on I/O or deserialization failures
+	 */
+	public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
+		final HttpGet httpGet = new HttpGet(url);
+		return doHTTPRequest(httpGet, clazz);
+	}
+
+	/**
+	 * Performs a GET request.
+	 *
+	 * @param url the URL to request
+	 * @return the response body as a string
+	 * @throws Exception on I/O failures
+	 */
+	public static String doGET(final String url) throws Exception {
+		final HttpGet httpGet = new HttpGet(url);
+		return doHTTPRequest(httpGet);
+	}
+
+	/**
+	 * Performs a POST request, sending objParam (when not null) serialized as JSON.
+	 *
+	 * @param url the URL to request
+	 * @param objParam the payload, serialized with Jackson; when null the request carries no body
+	 * @return the response body as a string
+	 * @throws Exception on serialization or I/O failures
+	 */
+	public static <V> String doPOST(final String url, V objParam) throws Exception {
+		final HttpPost httpPost = new HttpPost(url);
+
+		if (objParam != null) {
+			// ContentType.APPLICATION_JSON encodes the payload as UTF-8; the single-argument StringEntity
+			// constructor encodes as ISO-8859-1 and would corrupt characters outside that charset
+			final StringEntity entity = new StringEntity(
+				mapper.writeValueAsString(objParam), org.apache.http.entity.ContentType.APPLICATION_JSON);
+			httpPost.setEntity(entity);
+			httpPost.setHeader("Accept", "application/json");
+			httpPost.setHeader("Content-type", "application/json");
+		}
+		return doHTTPRequest(httpPost);
+	}
+
+	/**
+	 * Performs a POST request and deserializes the response body.
+	 *
+	 * @param url the URL to request
+	 * @param objParam the payload, serialized with Jackson; when null the request carries no body
+	 * @param clazz the type the response body is mapped to
+	 * @return the response body deserialized as clazz
+	 * @throws Exception on serialization, I/O or deserialization failures
+	 */
+	public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws Exception {
+		return mapper.readValue(doPOST(url, objParam), clazz);
+	}
+
+	/**
+	 * Executes the request with a freshly created default client and returns the response body as a string.
+	 * Both the client and the response are closed before returning.
+	 */
+	private static String doHTTPRequest(final HttpUriRequest r) throws Exception {
+		log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString());
+		log
+			.info(
+				"request headers: {}",
+				Arrays
+					.stream(r.getAllHeaders())
+					.map(h -> h.getName() + ":" + h.getValue())
+					.collect(Collectors.joining(",")));
+
+		// try-with-resources releases the connection and the client even when reading the body fails
+		try (CloseableHttpClient client = HttpClients.createDefault();
+			CloseableHttpResponse response = client.execute(r)) {
+			// NOTE(review): the body is decoded with the platform default charset and the status code is not
+			// checked - confirm whether callers rely on this
+			return IOUtils.toString(response.getEntity().getContent());
+		}
+	}
+
+	/**
+	 * Executes the request and maps the response body to clazz.
+	 */
+	private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {
+		return mapper.readValue(doHTTPRequest(r), clazz);
+	}
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java
@ -1,5 +1,5 @@

-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common.vocabulary;

 import java.io.Serializable;
 import java.util.HashMap;
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java
@ -1,5 +1,5 @@

-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common.vocabulary;

 import java.io.Serializable;
 import java.util.*;
@ -67,6 +67,10 @@ public class VocabularyGroup implements Serializable {

 	private final Map<String, Vocabulary> vocs = new HashMap<>();

+	/**
+	 * @return a read-only view of the keys under which the vocabularies are registered in this group; the view
+	 *         cannot be used to remove vocabularies from the group
+	 */
+	public Set<String> vocabularyNames() {
+		return Collections.unmodifiableSet(vocs.keySet());
+	}
+
 	public void addVocabulary(final String id, final String name) {
 		vocs.put(id.toLowerCase(), new Vocabulary(id, name));
 	}
@ -118,7 +122,31 @@ public class VocabularyGroup implements Serializable {
 		return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
 	}

+	/**
+	 * Variant of getSynonymAsQualifier that looks up the vocabulary using vocId exactly as given, without
+	 * lower-casing it.
+	 *
+	 * NOTE(review): addVocabulary registers vocabularies under the lower-cased id, so a vocId containing upper-case
+	 * characters would presumably not be found here and the call would fail with a NullPointerException - confirm
+	 * the intended behaviour.
+	 *
+	 * @param vocId the vocabulary identifier, matched exactly against the registered keys
+	 * @param syn the synonym to resolve
+	 * @return the qualifier returned by the vocabulary for syn, or OafMapperUtils.unknown("", "") when vocId is blank
+	 */
+	public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) {
+		if (StringUtils.isBlank(vocId)) {
+			return OafMapperUtils.unknown("", "");
+		}
+		return vocs.get(vocId).getSynonymAsQualifier(syn);
+	}
+
+	/**
+	 * Checks whether the term id exists in the vocabulary vocId, looking the vocabulary up by its lower-cased
+	 * identifier. Delegates to termExists(vocId, id, Boolean.FALSE).
+	 *
+	 * @param vocId the vocabulary identifier
+	 * @param id the term identifier
+	 * @return true when the vocabulary exists and contains the term
+	 */
 	public boolean termExists(final String vocId, final String id) {
+		return termExists(vocId, id, Boolean.FALSE);
+	}
+
+	/**
+	 * Checks whether the term id exists in the vocabulary vocId.
+	 *
+	 * @param vocId the vocabulary identifier
+	 * @param id the term identifier
+	 * @param caseSensitive when TRUE the vocabulary is looked up with vocId exactly as given; otherwise (FALSE or
+	 *        null) with the lower-cased vocId
+	 * @return true when the vocabulary exists and contains the term
+	 */
+	public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) {
+		if (Boolean.TRUE.equals(caseSensitive)) {
+			// NOTE(review): if vocabularyExists normalises the case of vocId, the exact-case lookup below can still
+			// return null - verify against vocabularyExists and addVocabulary
+			return vocabularyExists(vocId) && vocs.get(vocId).termExists(id);
+		}
 		return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id);
 	}

--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java
@ -1,5 +1,5 @@

-package eu.dnetlib.dhp.oa.graph.raw.common;
+package eu.dnetlib.dhp.common.vocabulary;

 import java.io.Serializable;

--- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java
@ -0,0 +1,64 @@
+
+package eu.dnetlib.dhp.message;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * Serializable message carrying a type, the identifier of the workflow it refers to and a free-form body of
+ * string key/value pairs.
+ */
+public class Message implements Serializable {
+
+	private static final long serialVersionUID = 401753881204524893L;
+
+	/** Body key used for the current progress value. Declared final so it cannot be reassigned at runtime. */
+	public static final String CURRENT_PARAM = "current";
+
+	/** Body key used for the total value. Declared final so it cannot be reassigned at runtime. */
+	public static final String TOTAL_PARAM = "total";
+
+	private MessageType messageType;
+
+	private String workflowId;
+
+	private Map<String, String> body;
+
+	/**
+	 * Creates an empty message: type, workflow id and body are all left null.
+	 */
+	public Message() {
+	}
+
+	/**
+	 * Creates a message with an empty, insertion-ordered body.
+	 *
+	 * @param messageType the type of the message
+	 * @param workflowId the identifier of the workflow the message refers to
+	 */
+	public Message(final MessageType messageType, final String workflowId) {
+		this(messageType, workflowId, new LinkedHashMap<>());
+	}
+
+	/**
+	 * @param messageType the type of the message
+	 * @param workflowId the identifier of the workflow the message refers to
+	 * @param body the message body; the map is stored as is, not copied
+	 */
+	public Message(final MessageType messageType, final String workflowId, final Map<String, String> body) {
+		this.messageType = messageType;
+		this.workflowId = workflowId;
+		this.body = body;
+	}
+
+	public MessageType getMessageType() {
+		return messageType;
+	}
+
+	public void setMessageType(MessageType messageType) {
+		this.messageType = messageType;
+	}
+
+	public String getWorkflowId() {
+		return workflowId;
+	}
+
+	public void setWorkflowId(final String workflowId) {
+		this.workflowId = workflowId;
+	}
+
+	public Map<String, String> getBody() {
+		return body;
+	}
+
+	public void setBody(final Map<String, String> body) {
+		this.body = body;
+	}
+
+	@Override
+	public String toString() {
+		return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body);
+	}
+
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java
@ -0,0 +1,94 @@
+
+package eu.dnetlib.dhp.message;
+
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpPut;
+import org.apache.http.entity.ContentType;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * Sends {@link Message}s to an HTTP endpoint: each message is serialized as JSON and sent with a PUT request
+ * submitted to an internal executor, so the send methods do not wait for the HTTP call to complete.
+ * Failures are logged and never propagated to the caller.
+ */
+public class MessageSender {
+
+	private static final Logger log = LoggerFactory.getLogger(MessageSender.class);
+
+	private static final int SOCKET_TIMEOUT_MS = 2000;
+
+	private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000;
+
+	private static final int CONNECTION_TIMEOUT_MS = 2000;
+
+	private final ObjectMapper objectMapper = new ObjectMapper();
+
+	private final String dnetMessageEndpoint;
+
+	private final String workflowId;
+
+	// NOTE(review): the executor is never shut down by this class - confirm that this is acceptable for callers
+	private final ExecutorService executorService = Executors.newCachedThreadPool();
+
+	/**
+	 * @param dnetMessageEndpoint the URL the messages are sent to
+	 * @param workflowId the workflow identifier set on the messages created by this sender
+	 */
+	public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
+		this.workflowId = workflowId;
+		this.dnetMessageEndpoint = dnetMessageEndpoint;
+	}
+
+	/**
+	 * Submits the message for asynchronous delivery.
+	 *
+	 * @param message the message to send
+	 */
+	public void sendMessage(final Message message) {
+		executorService.submit(() -> _sendMessage(message));
+	}
+
+	/**
+	 * Sends an ONGOING message reporting the progress.
+	 *
+	 * @param current the current progress value; must not be null
+	 * @param total the total value; omitted from the message body when null
+	 */
+	public void sendMessage(final Long current, final Long total) {
+		sendMessage(createOngoingMessage(current, total));
+	}
+
+	/**
+	 * Sends a REPORT message having the given map as body.
+	 *
+	 * @param report the body of the report message
+	 */
+	public void sendReport(final Map<String, String> report) {
+		sendMessage(new Message(MessageType.REPORT, workflowId, report));
+	}
+
+	/** Builds the ONGOING message; the total is added to the body only when it is known. */
+	private Message createOngoingMessage(final Long current, final Long total) {
+		final Message m = new Message(MessageType.ONGOING, workflowId);
+		m.getBody().put(Message.CURRENT_PARAM, current.toString());
+		if (total != null) {
+			m.getBody().put(Message.TOTAL_PARAM, total.toString());
+		}
+		return m;
+	}
+
+	/** Serializes the message and PUTs it to the endpoint, logging (not throwing) any failure. */
+	private void _sendMessage(final Message message) {
+		try {
+			final String json = objectMapper.writeValueAsString(message);
+
+			final HttpPut req = new HttpPut(dnetMessageEndpoint);
+			req.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
+
+			final RequestConfig requestConfig = RequestConfig
+				.custom()
+				.setConnectTimeout(CONNECTION_TIMEOUT_MS)
+				.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS)
+				.setSocketTimeout(SOCKET_TIMEOUT_MS)
+				.build();
+
+			try (final CloseableHttpClient client = HttpClients
+				.custom()
+				.setDefaultRequestConfig(requestConfig)
+				.build();
+				final CloseableHttpResponse response = client.execute(req)) {
+				// a completed HTTP exchange is not necessarily a success: report non-2xx answers as errors
+				final int status = response.getStatusLine().getStatusCode();
+				if (status >= 200 && status < 300) {
+					log.debug("Sent Message to {}", dnetMessageEndpoint);
+					log.debug("MESSAGE:{}", message);
+				} else {
+					log
+						.error(
+							"Error sending message to {}, HTTP status {}, message content: {}",
+							dnetMessageEndpoint, status, message);
+				}
+			} catch (final Throwable e) {
+				log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
+			}
+		} catch (final JsonProcessingException e) {
+			log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
+		}
+	}
+
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java
@ -0,0 +1,21 @@
+
+package eu.dnetlib.dhp.message;
+
+import java.io.Serializable;
+import java.util.Optional;
+
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * The kinds of message exchanged with the message endpoint.
+ */
+public enum MessageType implements Serializable {
+
+	ONGOING, REPORT;
+
+	/**
+	 * Resolves a message type from its name, ignoring the case.
+	 *
+	 * Declared static so that no existing constant is needed to look another one up; invoking it through an
+	 * instance still compiles.
+	 *
+	 * @param value the name of the message type, e.g. "ongoing" or "REPORT"
+	 * @return the matching constant
+	 * @throws IllegalArgumentException when value is null or does not match any constant
+	 */
+	public static MessageType from(final String value) {
+		if (value == null) {
+			throw new IllegalArgumentException("unknown message type: " + value);
+		}
+		try {
+			// Locale.ROOT keeps the upper-casing independent from the default locale of the JVM
+			return MessageType.valueOf(value.toUpperCase(java.util.Locale.ROOT));
+		} catch (final IllegalArgumentException e) {
+			// valueOf reports unknown names with its own generic message: rethrow with the documented one
+			throw new IllegalArgumentException("unknown message type: " + value, e);
+		}
+	}
+
+}
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
@ -149,17 +149,26 @@ public class CleaningFunctions {
 			if (Objects.nonNull(r.getInstance())) {

 				for (Instance i : r.getInstance()) {
-					final Set<StructuredProperty> pids = Sets.newHashSet(i.getPid());
-					i
-						.setAlternateIdentifier(
-							Optional
-								.ofNullable(i.getAlternateIdentifier())
-								.map(
-									altId -> altId
+					Optional
+						.ofNullable(i.getPid())
+						.ifPresent(pid -> {
+							final Set<StructuredProperty> pids =
+									pid
 										.stream()
-										.filter(p -> !pids.contains(p))
-										.collect(Collectors.toList()))
-								.orElse(Lists.newArrayList()));
+										.filter(Objects::nonNull)
+										.filter(p -> StringUtils.isNotBlank(p.getValue()))
+										.collect(Collectors.toCollection(HashSet::new));
+
+							Optional.ofNullable(i.getAlternateIdentifier())
+									.ifPresent(altId -> {
+										final Set<StructuredProperty> altIds = altId.stream()
+												.filter(Objects::nonNull)
+												.filter(p -> StringUtils.isNotBlank(p.getValue()))
+												.collect(Collectors.toCollection(HashSet::new));
+
+										i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
+									});
+						});

 					if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
 						i
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java
@ -1,18 +1,29 @@

 package eu.dnetlib.dhp.utils;

-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
+import java.io.*;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
 import java.util.List;
+import java.util.Map;
+import java.util.Properties;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;

 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.codec.binary.Base64OutputStream;
 import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.SaveMode;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Maps;
 import com.jayway.jsonpath.JsonPath;

 import net.minidev.json.JSONArray;
@ -21,6 +32,8 @@ import scala.collection.Seq;

 public class DHPUtils {

+	private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);
+
 	public static Seq<String> toSeq(List<String> list) {
 		return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
 	}
@ -79,4 +92,72 @@ public class DHPUtils {
 			return "";
 		}
 	}
+
+	public static final ObjectMapper MAPPER = new ObjectMapper();
+
+	/**
+	 * Writes the given content, encoded as UTF-8, to the file at the given path.
+	 *
+	 * @param conf the Hadoop configuration used to obtain the file system
+	 * @param content the text to write
+	 * @param path the path of the file to create
+	 * @throws IOException when the file cannot be created or written
+	 */
+	public static void writeHdfsFile(final Configuration conf, final String content, final String path)
+		throws IOException {
+
+		log.info("writing file {}, size {}", path, content.length());
+		// FileSystem.get(conf) returns (by default) a cached instance shared by the whole JVM: closing it would
+		// break every other user of that instance, hence a private instance is created and closed here
+		try (FileSystem fs = FileSystem.newInstance(conf);
+			BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
+			os.write(content.getBytes(StandardCharsets.UTF_8));
+			os.flush();
+		}
+	}
+
+	/**
+	 * Reads the whole content of the file at the given path, decoding it as UTF-8 (the encoding used by
+	 * writeHdfsFile).
+	 *
+	 * @param conf the Hadoop configuration used to obtain the file system
+	 * @param path the path of the file to read
+	 * @return the content of the file
+	 * @throws FileNotFoundException when the path does not exist
+	 * @throws IOException when the file cannot be read
+	 */
+	public static String readHdfsFile(Configuration conf, String path) throws IOException {
+		log.info("reading file {}", path);
+
+		// FileSystem.get(conf) returns (by default) a cached instance shared by the whole JVM: closing it would
+		// break every other user of that instance, hence a private instance is created and closed here
+		try (FileSystem fs = FileSystem.newInstance(conf)) {
+			final Path p = new Path(path);
+			if (!fs.exists(p)) {
+				throw new FileNotFoundException(path);
+			}
+			try (InputStream is = fs.open(p)) {
+				return IOUtils.toString(is, StandardCharsets.UTF_8.name());
+			}
+		}
+	}
+
+	/**
+	 * Reads the file at the given path through readHdfsFile and deserializes its content with the shared MAPPER.
+	 *
+	 * @param conf the Hadoop configuration used to obtain the file system
+	 * @param path the path of the file to read
+	 * @param clazz the type the content is mapped to
+	 * @return the content of the file deserialized as clazz
+	 * @throws IOException when the file cannot be read or its content cannot be deserialized
+	 */
+	public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException {
+		return MAPPER.readValue(readHdfsFile(conf, path), clazz);
+	}
+
+	/**
+	 * Saves the dataset in parquet format under targetPath, using SaveMode.Overwrite.
+	 *
+	 * @param mdstore the dataset to save
+	 * @param targetPath the path where the dataset is written
+	 */
+	public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
+		log.info("saving dataset in: {}", targetPath);
+		mdstore
+			.write()
+			.mode(SaveMode.Overwrite)
+			.format("parquet")
+			.save(targetPath);
+	}
+
+	/**
+	 * Creates a Hadoop configuration whose default file system is the given name node.
+	 *
+	 * As a side effect, the system property "hadoop.home.dir" is set to "/" for the whole JVM.
+	 *
+	 * @param nameNode the URI set as "fs.defaultFS"
+	 * @return the new configuration
+	 */
+	public static Configuration getHadoopConfiguration(String nameNode) {
+		// ====== Init HDFS File System Object
+		Configuration conf = new Configuration();
+		// Set FileSystem URI
+		conf.set("fs.defaultFS", nameNode);
+		// Because of Maven
+		// NOTE(review): presumably the implementations are pinned explicitly because the packaged jar may lose the
+		// file system service registrations - confirm
+		conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+		conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+
+		System.setProperty("hadoop.home.dir", "/");
+		return conf;
+	}
+
+	/**
+	 * Stores the entries of the given report as a properties file at the location named by the system property
+	 * "oozie.action.output.properties".
+	 *
+	 * @param report the key/value pairs to store
+	 * @throws IOException when the properties file cannot be written
+	 */
+	public static void populateOOZIEEnv(final Map<String, String> report) throws IOException {
+		final File target = new File(System.getProperty("oozie.action.output.properties"));
+
+		final Properties oozieProps = new Properties();
+		for (final Map.Entry<String, String> entry : report.entrySet()) {
+			oozieProps.setProperty(entry.getKey(), entry.getValue());
+		}
+
+		try (OutputStream out = new FileOutputStream(target)) {
+			oozieProps.store(out, "");
+		}
+	}
+
+	/**
+	 * Convenience overload storing a single key/value pair: wraps it in a map and delegates to
+	 * populateOOZIEEnv(Map).
+	 *
+	 * @param paramName the property name
+	 * @param value the property value
+	 * @throws IOException when the properties file cannot be written
+	 */
+	public static void populateOOZIEEnv(final String paramName, String value) throws IOException {
+		Map<String, String> report = Maps.newHashMap();
+		report.put(paramName, value);
+
+		populateOOZIEEnv(report);
+	}
 }
--- a/dhp-common/src/main/java/eu/dnetlib/message/Message.java
+++ b/dhp-common/src/main/java/eu/dnetlib/message/Message.java
@ -1,76 +0,0 @@
-
-package eu.dnetlib.message;
-
-import java.io.IOException;
-import java.util.Map;
-
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-public class Message {
-
-	private String workflowId;
-
-	private String jobName;
-
-	private MessageType type;
-
-	private Map<String, String> body;
-
-	public static Message fromJson(final String json) throws IOException {
-		final ObjectMapper jsonMapper = new ObjectMapper();
-		return jsonMapper.readValue(json, Message.class);
-	}
-
-	public Message() {
-	}
-
-	public Message(String workflowId, String jobName, MessageType type, Map<String, String> body) {
-		this.workflowId = workflowId;
-		this.jobName = jobName;
-		this.type = type;
-		this.body = body;
-	}
-
-	public String getWorkflowId() {
-		return workflowId;
-	}
-
-	public void setWorkflowId(String workflowId) {
-		this.workflowId = workflowId;
-	}
-
-	public String getJobName() {
-		return jobName;
-	}
-
-	public void setJobName(String jobName) {
-		this.jobName = jobName;
-	}
-
-	public MessageType getType() {
-		return type;
-	}
-
-	public void setType(MessageType type) {
-		this.type = type;
-	}
-
-	public Map<String, String> getBody() {
-		return body;
-	}
-
-	public void setBody(Map<String, String> body) {
-		this.body = body;
-	}
-
-	@Override
-	public String toString() {
-		final ObjectMapper jsonMapper = new ObjectMapper();
-		try {
-			return jsonMapper.writeValueAsString(this);
-		} catch (JsonProcessingException e) {
-			return null;
-		}
-	}
-}
--- a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java
+++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java
@ -1,47 +0,0 @@
-
-package eu.dnetlib.message;
-
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.util.concurrent.LinkedBlockingQueue;
-
-import com.rabbitmq.client.AMQP;
-import com.rabbitmq.client.Channel;
-import com.rabbitmq.client.DefaultConsumer;
-import com.rabbitmq.client.Envelope;
-
-public class MessageConsumer extends DefaultConsumer {
-
-	final LinkedBlockingQueue<Message> queueMessages;
-
-	/**
-	 * Constructs a new instance and records its association to the passed-in channel.
-	 *
-	 * @param channel the channel to which this consumer is attached
-	 * @param queueMessages
-	 */
-	public MessageConsumer(Channel channel, LinkedBlockingQueue<Message> queueMessages) {
-		super(channel);
-		this.queueMessages = queueMessages;
-	}
-
-	@Override
-	public void handleDelivery(
-		String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body)
-		throws IOException {
-		final String json = new String(body, StandardCharsets.UTF_8);
-		Message message = Message.fromJson(json);
-		try {
-			this.queueMessages.put(message);
-			System.out.println("Receiving Message " + message);
-		} catch (InterruptedException e) {
-			if (message.getType() == MessageType.REPORT)
-				throw new RuntimeException("Error on sending message");
-			else {
-				// TODO LOGGING EXCEPTION
-			}
-		} finally {
-			getChannel().basicAck(envelope.getDeliveryTag(), false);
-		}
-	}
-}
--- a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java
+++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java
@ -1,136 +0,0 @@
-
-package eu.dnetlib.message;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.TimeoutException;
-
-import com.rabbitmq.client.Channel;
-import com.rabbitmq.client.Connection;
-import com.rabbitmq.client.ConnectionFactory;
-
-public class MessageManager {
-
-	private final String messageHost;
-
-	private final String username;
-
-	private final String password;
-
-	private Connection connection;
-
-	private final Map<String, Channel> channels = new HashMap<>();
-
-	private boolean durable;
-
-	private boolean autodelete;
-
-	private final LinkedBlockingQueue<Message> queueMessages;
-
-	public MessageManager(
-		String messageHost,
-		String username,
-		String password,
-		final LinkedBlockingQueue<Message> queueMessages) {
-		this.queueMessages = queueMessages;
-		this.messageHost = messageHost;
-		this.username = username;
-		this.password = password;
-	}
-
-	public MessageManager(
-		String messageHost,
-		String username,
-		String password,
-		boolean durable,
-		boolean autodelete,
-		final LinkedBlockingQueue<Message> queueMessages) {
-		this.queueMessages = queueMessages;
-		this.messageHost = messageHost;
-		this.username = username;
-		this.password = password;
-
-		this.durable = durable;
-		this.autodelete = autodelete;
-	}
-
-	private Connection createConnection() throws IOException, TimeoutException {
-		ConnectionFactory factory = new ConnectionFactory();
-		factory.setHost(this.messageHost);
-		factory.setUsername(this.username);
-		factory.setPassword(this.password);
-		return factory.newConnection();
-	}
-
-	private Channel createChannel(
-		final Connection connection,
-		final String queueName,
-		final boolean durable,
-		final boolean autodelete)
-		throws Exception {
-		Map<String, Object> args = new HashMap<>();
-		args.put("x-message-ttl", 10000);
-		Channel channel = connection.createChannel();
-		channel.queueDeclare(queueName, durable, false, this.autodelete, args);
-		return channel;
-	}
-
-	private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete)
-		throws Exception {
-		if (channels.containsKey(queueName)) {
-			return channels.get(queueName);
-		}
-
-		if (this.connection == null) {
-			this.connection = createConnection();
-		}
-		channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete));
-		return channels.get(queueName);
-	}
-
-	public void close() throws IOException {
-		channels
-			.values()
-			.forEach(
-				ch -> {
-					try {
-						ch.close();
-					} catch (Exception e) {
-						// TODO LOG
-					}
-				});
-
-		this.connection.close();
-	}
-
-	public boolean sendMessage(final Message message, String queueName) throws Exception {
-		try {
-			Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete);
-			channel.basicPublish("", queueName, null, message.toString().getBytes());
-			return true;
-		} catch (Throwable e) {
-			throw new RuntimeException(e);
-		}
-	}
-
-	public boolean sendMessage(
-		final Message message, String queueName, boolean durable_var, boolean autodelete_var)
-		throws Exception {
-		try {
-			Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var);
-			channel.basicPublish("", queueName, null, message.toString().getBytes());
-			return true;
-		} catch (Throwable e) {
-			throw new RuntimeException(e);
-		}
-	}
-
-	public void startConsumingMessage(
-		final String queueName, final boolean durable, final boolean autodelete) throws Exception {
-
-		Channel channel = createChannel(createConnection(), queueName, durable, autodelete);
-		channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages));
-	}
-}
--- a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java
+++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java
@ -1,6 +0,0 @@
-
-package eu.dnetlib.message;
-
-public enum MessageType {
-	ONGOING, REPORT
-}
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java
@ -1,16 +0,0 @@
-
-package eu.dnetlib.dhp.model.mdstore;
-
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-import org.junit.jupiter.api.Test;
-
-public class MetadataRecordTest {
-
-	@Test
-	public void getTimestamp() {
-
-		MetadataRecord r = new MetadataRecord();
-		assertTrue(r.getDateOfCollection() > 0);
-	}
-}
--- a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java
@ -1,51 +0,0 @@
-
-package eu.dnetlib.message;
-
-import static org.junit.jupiter.api.Assertions.*;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.junit.jupiter.api.Test;
-
-public class MessageTest {
-
-	@Test
-	public void fromJsonTest() throws IOException {
-		Message m = new Message();
-		m.setWorkflowId("wId");
-		m.setType(MessageType.ONGOING);
-		m.setJobName("Collection");
-		Map<String, String> body = new HashMap<>();
-		body.put("parsedItem", "300");
-		body.put("ExecutionTime", "30s");
-
-		m.setBody(body);
-		System.out.println("m = " + m);
-		Message m1 = Message.fromJson(m.toString());
-		assertEquals(m1.getWorkflowId(), m.getWorkflowId());
-		assertEquals(m1.getType(), m.getType());
-		assertEquals(m1.getJobName(), m.getJobName());
-
-		assertNotNull(m1.getBody());
-		m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it)));
-		assertEquals(m1.getJobName(), m.getJobName());
-	}
-
-	@Test
-	public void toStringTest() {
-		final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}";
-		Message m = new Message();
-		m.setWorkflowId("wId");
-		m.setType(MessageType.ONGOING);
-		m.setJobName("Collection");
-		Map<String, String> body = new HashMap<>();
-		body.put("parsedItem", "300");
-		body.put("ExecutionTime", "30s");
-
-		m.setBody(body);
-
-		assertEquals(expectedJson, m.toString());
-	}
-}
--- a/dhp-schemas/pom.xml
+++ b/dhp-schemas/pom.xml
@ -67,6 +67,11 @@
            <artifactId>guava</artifactId>
        </dependency>

+        <dependency>
+            <groupId>commons-codec</groupId>
+            <artifactId>commons-codec</artifactId>
+        </dependency>
+
    </dependencies>


--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java
@ -3,17 +3,19 @@ package eu.dnetlib.dhp.schema.common;

 import static com.google.common.base.Preconditions.checkArgument;

+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 import java.text.ParseException;
-import java.text.SimpleDateFormat;
 import java.time.Instant;
 import java.time.format.DateTimeFormatter;
-import java.time.temporal.TemporalAccessor;
 import java.util.Date;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.function.Function;

+import org.apache.commons.codec.binary.Hex;
 import org.apache.commons.lang3.StringUtils;

 import com.google.common.collect.Maps;
@ -252,13 +254,6 @@ public class ModelSupport {
 					.setRelation("isRelatedTo")
 					.setRelType("resultResult")
 					.setSubReltype("relationship"));
-		relationInverseMap
-			.put(
-				"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
-					.setInverse("hasAmongTopNSimilarDocuments")
-					.setRelation("isAmongTopNSimilarDocuments")
-					.setRelType("resultResult")
-					.setSubReltype("similarity"));
 		relationInverseMap
 			.put(
 				"resultResult_supplement_isSupplementTo", new RelationInverse()
@ -482,6 +477,20 @@ public class ModelSupport {
 		return ((OafEntity) t).getId();
 	}

+	/**
+	 * Computes the MD5 digest of the UTF-8 bytes of the given string.
+	 *
+	 * @param s the string to hash
+	 * @return the digest as a hexadecimal string
+	 * @throws IllegalStateException when the MD5 algorithm is not available
+	 */
+	public static String md5(final String s) {
+		final MessageDigest digester;
+		try {
+			digester = MessageDigest.getInstance("MD5");
+		} catch (final NoSuchAlgorithmException e) {
+			throw new IllegalStateException(e);
+		}
+		final byte[] hash = digester.digest(s.getBytes(StandardCharsets.UTF_8));
+		return String.valueOf(Hex.encodeHex(hash));
+	}
+
+	/**
+	 * Builds an identifier by joining the namespace prefix and the MD5 of the original id with "::".
+	 *
+	 * @param originalId the original identifier, hashed with md5
+	 * @param nsPrefix the namespace prefix placed before the separator
+	 * @return the identifier in the form nsPrefix::md5(originalId)
+	 */
+	public static String generateIdentifier(final String originalId, final String nsPrefix) {
+		final String hashedId = md5(originalId);
+		return nsPrefix + "::" + hashedId;
+	}
+
 	public static String oldest(String dateA, String dateB) throws ParseException {

 		if (StringUtils.isBlank(dateA)) {
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java
@ -1,11 +1,13 @@

-package eu.dnetlib.dhp.model.mdstore;
+package eu.dnetlib.dhp.schema.mdstore;

 import java.io.Serializable;

-import eu.dnetlib.dhp.utils.DHPUtils;
+import eu.dnetlib.dhp.schema.common.ModelSupport;

-/** This class models a record inside the new Metadata store collection on HDFS * */
+/**
+ * This class models a record in a Metadata store collection on HDFS
+ */
 public class MetadataRecord implements Serializable {

 	/** The D-Net Identifier associated to the record */
@ -26,13 +28,13 @@ public class MetadataRecord implements Serializable {
 	private String body;

 	/** the date when the record has been stored */
-	private long dateOfCollection;
+	private Long dateOfCollection;

 	/** the date when the record has been stored */
-	private long dateOfTransformation;
+	private Long dateOfTransformation;

 	public MetadataRecord() {
-		this.dateOfCollection = System.currentTimeMillis();
+
 	}

 	public MetadataRecord(
@ -40,14 +42,14 @@ public class MetadataRecord implements Serializable {
 		String encoding,
 		Provenance provenance,
 		String body,
-		long dateOfCollection) {
+		Long dateOfCollection) {

 		this.originalId = originalId;
 		this.encoding = encoding;
 		this.provenance = provenance;
 		this.body = body;
 		this.dateOfCollection = dateOfCollection;
-		this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix());
+		this.id = ModelSupport.generateIdentifier(originalId, this.provenance.getNsPrefix());
 	}

 	public String getId() {
@ -90,19 +92,19 @@ public class MetadataRecord implements Serializable {
 		this.body = body;
 	}

-	public long getDateOfCollection() {
+	public Long getDateOfCollection() {
 		return dateOfCollection;
 	}

-	public void setDateOfCollection(long dateOfCollection) {
+	public void setDateOfCollection(Long dateOfCollection) {
 		this.dateOfCollection = dateOfCollection;
 	}

-	public long getDateOfTransformation() {
+	public Long getDateOfTransformation() {
 		return dateOfTransformation;
 	}

-	public void setDateOfTransformation(long dateOfTransformation) {
+	public void setDateOfTransformation(Long dateOfTransformation) {
 		this.dateOfTransformation = dateOfTransformation;
 	}

--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/Provenance.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/Provenance.java
@ -1,5 +1,5 @@

-package eu.dnetlib.dhp.model.mdstore;
+package eu.dnetlib.dhp.schema.mdstore;

 import java.io.Serializable;

--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala
@ -15,11 +15,11 @@ object OafUtils {
  }


-  def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
+  def generateDataInfo(trust: String = "0.9", invisible: Boolean = false): DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
-    di.setInvisible(false)
+    di.setInvisible(invisible)
    di.setTrust(trust)
    di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
    di
--- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java
+++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.schema.oaf;

 import static org.junit.jupiter.api.Assertions.*;

+import java.time.format.DateTimeParseException;
 import java.util.Arrays;
 import java.util.List;

@ -94,7 +95,7 @@ public class MergeTest {

 	@Test
 	public void mergeRelationTestParseException() {
-		assertThrows(IllegalArgumentException.class, () -> {
+		assertThrows(DateTimeParseException.class, () -> {
 			Relation a = createRel(true, "2016-04-05");
 			Relation b = createRel(true, "2016-04-05");
 			a.mergeFrom(b);
--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java
@ -5,12 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;

 import java.io.IOException;
-import java.util.Objects;
 import java.util.Optional;
 import java.util.function.BiFunction;
 import java.util.function.Function;

 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
@ -194,7 +194,7 @@ public class PromoteActionPayloadForGraphTableJob {
 		SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy);
 		SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy);
 		SerializableSupplier<G> zeroFn = zeroFn(rowClazz);
-		SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource;
+		SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget;

 		Dataset<G> joinedAndMerged = PromoteActionPayloadFunctions
 			.joinGraphTableWithActionPayloadAndMerge(
@ -238,12 +238,13 @@ public class PromoteActionPayloadForGraphTableJob {
 		}
 	}

-	private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSource() {
+	private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSourceAndTarget() {
 		return t -> {
 			if (isSubClass(t, Relation.class)) {
-				return Objects.nonNull(((Relation) t).getSource());
+				final Relation rel = (Relation) t;
+				return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget());
 			}
-			return Objects.nonNull(((OafEntity) t).getId());
+			return StringUtils.isNotBlank(((OafEntity) t).getId());
 		};
 	}

--- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java
+++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java
@ -112,6 +112,7 @@ public class PromoteActionPayloadFunctions {
 		Class<G> rowClazz) {
 		TypedColumn<G, G> aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn();
 		return rowDS
+			.filter((FilterFunction<G>) o -> isNotZeroFn.get().apply(o))
 			.groupByKey((MapFunction<G, String>) x -> rowIdFn.get().apply(x), Encoders.STRING())
 			.agg(aggregator)
 			.map((MapFunction<Tuple2<String, G>, G>) Tuple2::_2, Encoders.kryo(rowClazz));
--- a/dhp-workflows/dhp-aggregation/README.md
+++ b/dhp-workflows/dhp-aggregation/README.md
@ -1,29 +1,27 @@
 Description of the Module
 --------------------------
-This module defines a **collector worker application** that runs on Hadoop.
+This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
+Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
+the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping 
+of each MDStore.

-It is responsible for harvesting metadata using different plugins.
+## Metadata collection

-The collector worker uses a message queue to inform the progress 
-of the harvesting action (using a message queue for sending **ONGOING** messages) furthermore, 
-It gives, at the end of the job, some information about the status 
-of the collection i.e Number of records collected(using a message queue for sending **REPORT** messages).
+The **metadata collection workflow** is responsible for harvesting metadata records exposed through different protocols and in
+different formats, and for storing them on HDFS so that they can be further processed.

-To work the collection worker need some parameter like:
+### Collector Plugins

-* **hdfsPath**: the path where storing the sequential file
-* **apidescriptor**: the JSON encoding of the API Descriptor
-* **namenode**: the Name Node URI
-* **userHDFS**: the user wich create the hdfs seq file
-* **rabbitUser**: the user to connect with RabbitMq for messaging
-* **rabbitPassWord**: the password to connect with RabbitMq for messaging
-* **rabbitHost**: the host of the RabbitMq server
-* **rabbitOngoingQueue**: the name of the ongoing queue
-* **rabbitReportQueue**: the name of the report queue
-* **workflowId**: the identifier of the dnet Workflow
+Different protocols are managed by dedicated Collector plugins, i.e. java programs implementing a defined interface:

-##Plugins
-* OAI Plugin 
+```eu.dnetlib.dhp.collection.plugin.CollectorPlugin```

-## Usage
+The list of the supported plugins:
+
+* OAI Plugin: collects from OAI-PMH compatible endpoints
+* MDStore plugin: collects from a given D-Net MetadataStore (identified by mongodb URI, dbName, MDStoreID)
+* MDStore dump plugin: collects from an MDStore dump stored on the HDFS location indicated by the `path` parameter 
+
+## Transformation Plugins
 TODO
+
--- a/dhp-workflows/dhp-aggregation/pom.xml
+++ b/dhp-workflows/dhp-aggregation/pom.xml
@ -7,10 +7,44 @@
        <version>1.2.4-SNAPSHOT</version>
    </parent>
    <artifactId>dhp-aggregation</artifactId>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>${net.alchim31.maven.version}</version>
+                <executions>
+                    <execution>
+                        <id>scala-compile-first</id>
+                        <phase>initialize</phase>
+                        <goals>
+                            <goal>add-source</goal>
+                            <goal>compile</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>scala-test-compile</id>
+                        <phase>process-test-resources</phase>
+                        <goals>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <scalaVersion>${scala.version}</scalaVersion>
+                </configuration>
+            </plugin>
+        </plugins>

+    </build>

    <dependencies>

+        <dependency>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpclient</artifactId>
+        </dependency>
+
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
@ -24,15 +58,9 @@
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-common</artifactId>
            <version>${project.version}</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>com.sun.xml.bind</groupId>
-                    <artifactId>jaxb-core</artifactId>
-                </exclusion>
-            </exclusions>
        </dependency>

-         <dependency>
+        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-schemas</artifactId>
            <version>${project.version}</version>
@ -58,6 +86,11 @@
            <artifactId>jaxen</artifactId>
        </dependency>

+        <dependency>
+            <groupId>org.json</groupId>
+            <artifactId>json</artifactId>
+        </dependency>
+
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
        <dependency>
            <groupId>org.apache.commons</groupId>
@ -78,7 +111,10 @@
            <artifactId>commons-compress</artifactId>
        </dependency>

-
+        <dependency>
+            <groupId>org.mongodb</groupId>
+            <artifactId>mongo-java-driver</artifactId>
+        </dependency>

    </dependencies>

--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala
@ -0,0 +1,86 @@
+package eu.dnetlib.dhp.actionmanager.datacite
+
+import org.apache.commons.io.IOUtils
+import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
+import org.apache.http.entity.StringEntity
+import org.apache.http.impl.client.HttpClients
+
+import java.io.IOException
+
+/**
+  * Base class for REST clients exposing a remote, paginated resource as an `Iterator[String]`.
+  *
+  * Items are served from `buffer`; when the last buffered item is handed out, `getBufferData()`
+  * is invoked so that the implementation can load the next page. The first page is loaded
+  * when the instance is constructed (see the `getBufferData()` call at the end of the class body).
+  */
+abstract class AbstractRestClient extends Iterator[String]{
+
+  // items of the page currently being served
+  var buffer: List[String] = List()
+  // position in `buffer` of the item returned by the next call to next()
+  var current_index:Int = 0
+
+  // not read by this class; available to implementations to keep their paging state
+  var scroll_value: Option[String] = None
+
+  // not read by this class; available to implementations to flag that no further page exists
+  var complete:Boolean = false
+
+
+  /** Parses a raw response; implementations are expected to refresh the iteration state from it. */
+  def extractInfo(input: String): Unit
+
+  /** Loads the next page of data; called at construction time and whenever the buffer is exhausted. */
+  protected def getBufferData(): Unit
+
+
+  /**
+    * Performs an HTTP GET.
+    *
+    * @param url the address to request
+    * @return the response body, or the empty string when every attempt got an error status
+    */
+  def doHTTPGETRequest(url:String): String = {
+    val httpGet = new HttpGet(url)
+    doHTTPRequest(httpGet)
+
+  }
+
+  /**
+    * Performs an HTTP POST; when `json` is not null it is sent as an application/json body.
+    *
+    * @param url  the address to request
+    * @param json the JSON payload, may be null
+    * @return the response body, or the empty string when every attempt got an error status
+    */
+  def doHTTPPOSTRequest(url:String, json:String): String = {
+    val httpPost = new HttpPost(url)
+    if (json != null) {
+      // explicit charset: the single-argument constructor does not encode the payload as UTF-8
+      val entity = new StringEntity(json, "UTF-8")
+      httpPost.setEntity(entity)
+      httpPost.setHeader("Accept", "application/json")
+      httpPost.setHeader("Content-type", "application/json")
+    }
+    doHTTPRequest(httpPost)
+  }
+
+  def hasNext: Boolean = {
+    buffer.nonEmpty && current_index < buffer.size
+  }
+
+
+  /**
+    * Returns the next buffered item and, when it was the last one of the page, asks for more data.
+    *
+    * @throws NoSuchElementException when no item is available
+    */
+  override def next(): String = {
+    if (!hasNext)
+      throw new NoSuchElementException("no more items available")
+    val next_item:String = buffer(current_index)
+    current_index = current_index + 1
+    if (current_index == buffer.size)
+      getBufferData()
+    next_item
+  }
+
+
+
+
+  /**
+    * Executes the request, retrying up to 4 times while the server answers with an error status (>= 400).
+    *
+    * @return the response body decoded as UTF-8, or the empty string when all the attempts failed
+    * @throws RuntimeException wrapping any error raised while executing the request or closing the client
+    */
+  private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={
+    val client = HttpClients.createDefault
+    var tries = 4
+    try {
+      while (tries > 0) {
+
+        println(s"requesting ${r.getURI}")
+        val response = client.execute(r)
+        try {
+          println(s"get response with status${response.getStatusLine.getStatusCode}")
+          // 400 itself is an error too: the original check (> 400) let it through as a success
+          if (response.getStatusLine.getStatusCode >= 400) {
+            tries -= 1
+          }
+          else
+            return IOUtils.toString(response.getEntity.getContent, "UTF-8")
+        } finally {
+          // always release the connection, otherwise failed attempts keep it leased from the pool
+          response.close()
+        }
+      }
+      ""
+    } catch {
+      case e: Throwable =>
+        throw new RuntimeException("Error on executing request ", e)
+    } finally try client.close()
+    catch {
+      case e: IOException =>
+        throw new RuntimeException("Unable to close client ", e)
+    }
+  }
+
+  // eagerly load the first page so that hasNext is meaningful right after construction
+  getBufferData()
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala
@ -0,0 +1,31 @@
+package eu.dnetlib.dhp.actionmanager.datacite
+
+import org.json4s.{DefaultFormats, JValue}
+import org.json4s.jackson.JsonMethods.{compact, parse, render}
+
+/**
+  * Iterator over the records returned by the DataCite `dois` endpoint.
+  *
+  * @param timestamp lower bound of the `updated` range used in the query
+  * @param blocks    value of the `page[size]` request parameter
+  * @param until     upper bound of the `updated` range; values not greater than 0 mean open-ended (`*`)
+  */
+class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
+
+  /**
+    * Refills the buffer with the entries of the `data` array (each rendered as compact JSON)
+    * and records the `links.next` address, if any, as the next page to request.
+    */
+  override def extractInfo(input: String): Unit = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: org.json4s.JValue = parse(input)
+    val records: List[JValue] = (json \ "data").extract[List[JValue]]
+    buffer = records.map(record => compact(render(record)))
+    val nextPage: String = (json \ "links" \ "next").extractOrElse[String](null)
+    scroll_value = Option(nextPage).filter(link => link.nonEmpty)
+    // without a link to a further page the iteration is over
+    if (scroll_value.isEmpty)
+      complete = true
+    current_index = 0
+  }
+
+  /** Builds the address of the first page. */
+  def get_url():String ={
+    val upperBound = if (until> 0) until.toString else "*"
+    s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$upperBound]"
+
+  }
+
+  /** Requests the next page (or the first one when no page was fetched yet), unless the iteration is complete. */
+  override def getBufferData(): Unit = {
+    if (!complete) {
+      val pageUrl = scroll_value.getOrElse(get_url())
+      extractInfo(doHTTPGETRequest(pageUrl))
+    }
+  }
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala
@ -0,0 +1,482 @@
+package eu.dnetlib.dhp.actionmanager.datacite
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
+import eu.dnetlib.dhp.schema.action.AtomicAction
+import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
+import eu.dnetlib.dhp.utils.DHPUtils
+import org.apache.commons.lang3.StringUtils
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.{JField, JObject, JString}
+import org.json4s.jackson.JsonMethods.parse
+
+import java.nio.charset.CodingErrorAction
+import java.text.SimpleDateFormat
+import java.time.LocalDate
+import java.time.format.DateTimeFormatter
+import java.util.{Date, Locale}
+import java.util.regex.Pattern
+import scala.collection.JavaConverters._
+import scala.io.{Codec, Source}
+
+
+
+// Plain data holder: doi, timestamp, active flag and the raw json string.
+case class DataciteType(doi:String,timestamp:Long,isActive:Boolean, json:String ){}
+
+// Element of CreatorType.nameIdentifiers.
+case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
+
+// Shape used by generateOAF to extract the entries of the "creators" field.
+case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
+
+// Shape used by generateOAF to extract the entries of the "titles" field.
+case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
+
+// Shape used by generateOAF to extract the entries of the "subjects" field.
+case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
+
+// Shape used by generateOAF to extract the entries of the "descriptions" field.
+case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
+
+// NOTE(review): presumably mirrors the entries of "fundingReferences"; generateOAF reads awardUri directly from the JSON instead.
+case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
+
+// Shape used by generateOAF to extract the entries of the "dates" field.
+case class DateType(date: Option[String], dateType: Option[String]) {}
+
+// Value type of the map loaded from the hostedBy_map.json resource.
+case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
+
+object DataciteToOAFTransformation {
+
+  implicit val codec: Codec = Codec("UTF-8")
+  codec.onMalformedInput(CodingErrorAction.REPLACE)
+  codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
+
+
+
+  private val PID_VOCABULARY = "dnet:pid_types"
+  val COBJ_VOCABULARY = "dnet:publication_resource"
+  val RESULT_VOCABULARY = "dnet:result_typologies"
+  val ACCESS_MODE_VOCABULARY = "dnet:access_modes"
+  val DOI_CLASS = "doi"
+
+  val TITLE_SCHEME = "dnet:dataCite_title"
+  val SUBJ_CLASS = "keywords"
+  val SUBJ_SCHEME = "dnet:subject_classification_typologies"
+
+  val j_filter:List[String] = {
+    val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
+    s.lines.toList
+  }
+
+  val mapper = new ObjectMapper()
+  val unknown_repository: HostedByMapType = HostedByMapType("openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18", "Unknown Repository", "Unknown Repository", Some(1.0F))
+
+  val dataInfo: DataInfo = generateDataInfo("0.9")
+  val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite")
+
+  val hostedByMap: Map[String, HostedByMapType] = {
+    val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: org.json4s.JValue = parse(s)
+    json.extract[Map[String, HostedByMapType]]
+  }
+
+  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
+  val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
+
+  val funder_regex:List[(Pattern, String)] = List(
+    (Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",  Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda__h2020::"),
+    (Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",  Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda_______::")
+
+  )
+
+  val Date_regex: List[Pattern] = List(
+    //Y-M-D
+    Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
+    //M-D-Y
+    Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
+    //D-M-Y
+    Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
+    //Y
+    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
+  )
+
+
+  /**
+    * Tells whether the record must be discarded.
+    *
+    * @param json the raw record
+    * @return true when the record contains at least one of the strings listed in `j_filter`
+    */
+  def filter_json(json:String):Boolean = {
+    j_filter.exists(marker => json.contains(marker))
+  }
+
+  /**
+    * Wraps an OAF object into an AtomicAction and serializes it as JSON.
+    *
+    * @param item the object to export; datasets, publications, software, other research products
+    *             and relations are supported
+    * @return the pair (canonical class name of the item, JSON of the AtomicAction),
+    *         or null when the item is of any other type
+    */
+  def toActionSet(item:Oaf) :(String, String) = {
+
+    // builds the action for the given payload and serializes it with the object-level ObjectMapper,
+    // instead of allocating a new mapper on every invocation
+    def serialize[T <: Oaf](payload: T, clazz: Class[T]): (String, String) = {
+      val action: AtomicAction[T] = new AtomicAction[T]
+      action.setClazz(clazz)
+      action.setPayload(payload)
+      (payload.getClass.getCanonicalName, mapper.writeValueAsString(action))
+    }
+
+    item match {
+      case dataset: OafDataset =>
+        serialize(dataset, classOf[OafDataset])
+      case publication: Publication =>
+        serialize(publication, classOf[Publication])
+      case software: Software =>
+        serialize(software, classOf[Software])
+      case orp: OtherResearchProduct =>
+        serialize(orp, classOf[OtherResearchProduct])
+      case relation: Relation =>
+        serialize(relation, classOf[Relation])
+      case _ =>
+        null
+    }
+
+  }
+
+
+
+
+  /**
+    * Tells whether the embargo is over.
+    *
+    * @param embargo_end_date the end of the embargo, formatted as yyyy-MM-dd
+    * @return true when the current date is strictly after the given date
+    */
+  def embargo_end(embargo_end_date: String): Boolean = {
+    val formatter = DateTimeFormatter.ofPattern("[yyyy-MM-dd]")
+    val embargoEnd = LocalDate.parse(embargo_end_date, formatter)
+    LocalDate.now().isAfter(embargoEnd)
+  }
+
+
+  /**
+    * Looks for a date inside a free-text value and normalizes it.
+    *
+    * The patterns of `Date_regex` are tried in order and the first match is parsed with `df_en`
+    * and then, if that fails, with `df_it`. A match made of 4 characters (a bare year) is
+    * expanded to `01-01-<year>` before parsing.
+    *
+    * @param input the text to inspect, may be null
+    * @return the date rendered by `LocalDate.toString`, or None when the input is null,
+    *         no pattern matches, or the match cannot be parsed
+    */
+  def extract_date(input: String): Option[String] = {
+    if (input == null)
+      return None
+
+    // stop at the first pattern that matches instead of running all of them
+    val candidate: Option[String] = Date_regex.iterator
+      .map(pattern => pattern.matcher(input))
+      .find(matcher => matcher.find())
+      .map(matcher => matcher.group(0))
+
+    candidate.flatMap(raw => {
+      val a_date = if (raw.length == 4) s"01-01-$raw" else raw
+      // Try only intercepts non-fatal errors, unlike the former `case _: Throwable`
+      scala.util.Try(LocalDate.parse(a_date, df_en))
+        .orElse(scala.util.Try(LocalDate.parse(a_date, df_it)))
+        .toOption
+        .map(parsed => parsed.toString)
+    })
+  }
+
+  /**
+    * Resolves the type of a record against the vocabularies.
+    *
+    * The candidate values are looked up as synonyms in `COBJ_VOCABULARY` in this order:
+    * resourceType, schemaOrg, resourceTypeGeneral; null or empty candidates are skipped.
+    *
+    * @return the pair (qualifier found in COBJ_VOCABULARY, synonym lookup of its classid in RESULT_VOCABULARY)
+    *         for the first candidate that resolves, or null when none does
+    */
+  def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): (Qualifier, Qualifier) = {
+    // the iterator keeps the lookups lazy: later candidates are resolved only if the earlier ones fail
+    Iterator(resourceType, schemaOrg, resourceTypeGeneral)
+      .filter(candidate => candidate != null && candidate.nonEmpty)
+      .map(candidate => vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, candidate))
+      .find(typeQualifier => typeQualifier != null)
+      .map(typeQualifier => (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)))
+      .orNull
+  }
+
+
+  /**
+    * Creates the empty result matching the type of the record, with a single instance
+    * carrying the resolved instance type.
+    *
+    * @return a Dataset, Publication, Software or OtherResearchProduct depending on the class name of
+    *         the second qualifier returned by `getTypeQualifier`; null when the type cannot be
+    *         resolved or the class name is not one of "dataset", "publication", "software", "other"
+    */
+  def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): Result = {
+    val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
+    // the second lookup can fail even when the first one succeeds
+    if (typeQualifiers == null || typeQualifiers._2 == null)
+      return null
+
+    val r: Result = typeQualifiers._2.getClassname match {
+      case "dataset" => new OafDataset
+      case "publication" => new Publication
+      case "software" => new Software
+      case "other" => new OtherResearchProduct
+      // unknown (or null) class name: report "not mapped" instead of raising a MatchError
+      case _ => return null
+    }
+
+    val i = new Instance
+    i.setInstancetype(typeQualifiers._1)
+    r.setInstance(List(i).asJava)
+    r
+  }
+
+
+  /**
+    * Tells whether the record declares a date whose type is "available" (case insensitive).
+    *
+    * @param input the raw JSON of the record
+    * @return true when at least one "dateType" found under a "dates" field equals "available"
+    */
+  def available_date(input: String): Boolean = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: org.json4s.JValue = parse(input)
+
+    val dateTypeValues: List[String] = for {
+      JObject(dateFields) <- json \\ "dates"
+      JField("dateType", JString(dateTypeValue)) <- dateFields
+    } yield dateTypeValue
+
+    dateTypeValues.exists(value => value.equalsIgnoreCase("available"))
+  }
+
+
+  /**
+    * Builds the structured property for a date value.
+    *
+    * @param dt the date value
+    * @param q  the qualifier attached to the value
+    * @return the property created by OafMapperUtils.structuredProperty, with a null third argument
+    */
+  def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
+    OafMapperUtils.structuredProperty(dt, q, null)
+  }
+
+  /**
+    * Creates a relation of type "resultProject" / "outcome" between the two identifiers.
+    *
+    * @param sourceId the identifier set as source
+    * @param targetId the identifier set as target
+    * @param relClass the relation class
+    * @param cf       the datasource set, as a single-element list, in collectedfrom
+    * @param di       the data info attached to the relation
+    */
+  def generateRelation(sourceId:String, targetId:String, relClass:String, cf:KeyValue, di:DataInfo) :Relation = {
+    val relation = new Relation
+    // fixed semantics shared by every relation created here
+    relation.setRelType("resultProject")
+    relation.setSubRelType("outcome")
+    relation.setRelClass(relClass)
+    // endpoints
+    relation.setSource(sourceId)
+    relation.setTarget(targetId)
+    // provenance
+    relation.setCollectedfrom(List(cf).asJava)
+    relation.setDataInfo(di)
+    relation
+  }
+
+  /**
+    * Creates the relations between a result and the project referenced by an award URI.
+    *
+    * @param awardUri the award URI found in the funding references
+    * @param sourceId the identifier of the result
+    * @return the "isProducedBy" relation and its "produces" counterpart when the URI matches one of
+    *         the patterns in `funder_regex`, an empty list otherwise
+    */
+  def get_projectRelation(awardUri:String, sourceId:String):List[Relation] = {
+    funder_regex.find(entry => entry._1.matcher(awardUri).find()) match {
+      case Some((pattern, projectPrefix)) =>
+        // the second capturing group of the pattern is the grant number
+        val grantId = pattern.matcher(awardUri).replaceAll("$2")
+        val targetId = s"$projectPrefix${DHPUtils.md5(grantId)}"
+        List(
+          generateRelation(sourceId, targetId,"isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
+          generateRelation(targetId, sourceId,"produces", DATACITE_COLLECTED_FROM, dataInfo)
+        )
+      case None =>
+        List()
+    }
+  }
+
+
+  /**
+    * Maps one DataCite record to its OAF representation.
+    *
+    * @param input            the raw JSON of the record
+    * @param ts               value formatted as date of transformation
+    * @param dateOfCollection value used, multiplied by 1000, to build the date of collection
+    * @param vocabularies     vocabularies used to resolve types, pid types, dates, languages and access modes
+    * @return the result followed by its project relations (if any); an empty list when the record is
+    *         discarded by filter_json, has an empty doi, has a type that cannot be mapped, or has no authors
+    */
+  def generateOAF(input:String,ts:Long, dateOfCollection:Long, vocabularies: VocabularyGroup):List[Oaf] = {
+    if (filter_json(input))
+      return  List()
+
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json = parse(input)
+
+    val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
+    val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
+    val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
+
+    val doi = (json \ "attributes" \ "doi").extract[String]
+    if (doi.isEmpty)
+      return List()
+
+    //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
+    val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
+    if (result == null)
+      return List()
+
+
+    // the doi is both the pid and the seed of the OpenAIRE identifier
+    val doi_q = vocabularies.getSynonymAsQualifier(PID_VOCABULARY, "doi")
+    val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
+    result.setPid(List(pid).asJava)
+    result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
+    result.setOriginalId(List(doi).asJava)
+
+    // NOTE(review): dateOfCollection is scaled by 1000 (presumably seconds since epoch) while ts below is formatted as is - confirm the units
+    val d = new Date(dateOfCollection*1000)
+    val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
+
+
+    result.setDateofcollection(ISO8601FORMAT.format(d))
+    result.setDateoftransformation(ISO8601FORMAT.format(ts))
+    result.setDataInfo(dataInfo)
+
+    val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
+
+
+    // one author per creator, ranked by position (1-based)
+    val authors = creators.zipWithIndex.map { case (c, idx) =>
+      val a = new Author
+      a.setFullname(c.name.orNull)
+      a.setName(c.givenName.orNull)
+      a.setSurname(c.familyName.orNull)
+      if (c.nameIdentifiers!= null&& c.nameIdentifiers.isDefined  && c.nameIdentifiers.get != null) {
+        // NOTE(review): identifiers without a value are mapped to null, so the pid list may contain null entries
+        a.setPid(c.nameIdentifiers.get.map(ni => {
+          val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(PID_VOCABULARY, ni.nameIdentifierScheme.get.toLowerCase()) else null
+          if (ni.nameIdentifier!= null && ni.nameIdentifier.isDefined) {
+            OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
+          }
+          else
+            null
+
+        }
+        )
+          .asJava)
+      }
+      if (c.affiliation.isDefined)
+        a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
+      a.setRank(idx + 1)
+      a
+    }
+
+
+
+
+    val titles:List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
+
+    // titles without an explicit type are stored as "main title"
+    result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
+      if (t.titleType.isEmpty) {
+        OafMapperUtils.structuredProperty(t.title.get, "main title", "main title", TITLE_SCHEME, TITLE_SCHEME, null)
+      } else {
+        OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, TITLE_SCHEME, TITLE_SCHEME, null)
+      }
+    }).asJava)
+
+    // records without authors are discarded
+    if(authors==null || authors.isEmpty || !authors.exists(a => a !=null))
+      return List()
+    result.setAuthor(authors.asJava)
+
+    val dates = (json \\ "dates").extract[List[DateType]]
+    val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
+
+    // first date typed "issued" (outer Option) and the outcome of its parsing (inner Option)
+    val i_date = dates
+      .filter(d => d.date.isDefined && d.dateType.isDefined)
+      .find(d => d.dateType.get.equalsIgnoreCase("issued"))
+      .map(d => extract_date(d.date.get))
+    // first date typed "available" that can be parsed
+    val a_date: Option[String] = dates
+      .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
+      .map(d => extract_date(d.date.get))
+      .find(d => d != null  && d.isDefined)
+      .map(d => d.get)
+
+    if (a_date.isDefined) {
+      result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
+    }
+    if (i_date.isDefined && i_date.get.isDefined) {
+      result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
+      result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
+    }
+    else if (publication_year != null) {
+      // NOTE(review): this fallback stores 01-01-<year>, while the branch above stores the output of extract_date (LocalDate.toString) - confirm the format is intended
+      result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
+      result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
+    }
+
+
+    // relevant dates: only the parsable ones whose type resolves in dnet:dataCite_date
+    result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
+      .map(d => (extract_date(d.date.get), d.dateType.get))
+      .filter(d => d._1.isDefined)
+      .map(d => (d._1.get, vocabularies.getTermAsQualifier("dnet:dataCite_date", d._2.toLowerCase())))
+      .filter(d => d._2 != null)
+      .map(d => generateOAFDate(d._1, d._2)).asJava)
+
+    val subjects = (json \\ "subjects").extract[List[SubjectType]]
+
+    result.setSubject(subjects.filter(s => s.subject.nonEmpty)
+      .map(s =>
+        OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, SUBJ_SCHEME, SUBJ_SCHEME, null)
+      ).asJava)
+
+
+    result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
+
+    val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
+
+    result.setDescription(
+      descriptions
+        .filter(d => d.description.isDefined).
+        map(d =>
+          OafMapperUtils.field(d.description.get, null)
+        ).filter(s => s!=null).asJava)
+
+
+    val publisher = (json \\ "publisher").extractOrElse[String](null)
+    if (publisher != null)
+      result.setPublisher(OafMapperUtils.field(publisher, null))
+
+
+    val language: String = (json \\ "language").extractOrElse[String](null)
+
+    if (language != null)
+      result.setLanguage(vocabularies.getSynonymAsQualifier("dnet:languages", language))
+
+
+    val instance = result.getInstance().get(0)
+
+    val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
+
+    val accessRights:List[String] =  for {
+      JObject(rightsList) <- json \\ "rightsList"
+      JField("rightsUri", JString(rightsUri)) <- rightsList
+    } yield rightsUri
+
+    val aRights: Option[Qualifier] = accessRights.map(r => {
+      vocabularies.getSynonymAsQualifier(ACCESS_MODE_VOCABULARY, r)
+    }).find(q => q != null)
+
+
+    // computed but currently unused: the setAccessright call below is commented out
+    val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier("UNKNOWN", "not available", ACCESS_MODE_VOCABULARY, ACCESS_MODE_VOCABULARY)
+
+    // hostedby, collectedfrom, url and license are set on the instance only when the client id is present
+    if (client.isDefined) {
+      // lookup by upper-cased client id, falling back to the unknown repository
+      val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
+      instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
+      instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
+      instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
+//      instance.setAccessright(access_rights_qualifier)
+
+      //'http') and matches(., '.*(/licenses|/publicdomain|unlicense.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*')]">
+      val license = accessRights
+        .find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
+      if (license.isDefined)
+        instance.setLicense(OafMapperUtils.field(license.get, null))
+    }
+
+
+    val awardUris:List[String] =  for {
+      JObject(fundingReferences) <- json \\ "fundingReferences"
+      JField("awardUri", JString(awardUri)) <- fundingReferences
+    } yield awardUri
+
+    // two relations (one per direction) for every award URI matching a known funder pattern
+    val relations:List[Relation] =awardUris.flatMap(a=> get_projectRelation(a, result.getId)).filter(r => r!= null)
+
+    if (relations!= null && relations.nonEmpty) {
+      List(result):::relations
+    }
+    else
+      List(result)
+  }
+
+  /**
+    * Creates the data info attached to the objects generated from DataCite.
+    *
+    * @param trust the trust value
+    * @return a DataInfo with the given trust, the "sysimport:actionset" provenance action and the
+    *         deletedbyinference, inferred and invisible flags set to false
+    */
+  def generateDataInfo(trust: String): DataInfo = {
+    val provenanceAction = OafMapperUtils.qualifier("sysimport:actionset", "sysimport:actionset", "dnet:provenanceActions", "dnet:provenanceActions")
+    val info = new DataInfo
+    info.setTrust(trust)
+    info.setProvenanceaction(provenanceAction)
+    // none of the flags applies to freshly imported records
+    info.setInvisible(false)
+    info.setInferred(false)
+    info.setDeletedbyinference(false)
+    info
+  }
+
+  /**
+    * Builds a datasource identifier from a value shaped as `<prefix>::<local id>`.
+    *
+    * @param input the value to convert
+    * @return `10|<prefix>::<md5 of the local id>`, where prefix and local id are the parts of the
+    *         input before and after the first "::"
+    */
+  def generateDSId(input: String): String = {
+    val nsPrefix = StringUtils.substringBefore(input, "::")
+    val localId = StringUtils.substringAfter(input, "::")
+    val hashedLocalId = DHPUtils.md5(localId)
+    "10|" + nsPrefix + "::" + hashedLocalId
+  }
+
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala
@ -0,0 +1,41 @@
+package eu.dnetlib.dhp.actionmanager.datacite
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import eu.dnetlib.dhp.schema.oaf.Oaf
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.hadoop.mapred.SequenceFileOutputFormat
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
+
+import scala.io.Source
+
+/**
+ * Spark job that loads a dataset of Oaf records from `sourcePath`, converts each record with
+ * DataciteToOAFTransformation.toActionSet, drops the null results and stores the remaining
+ * pairs under `targetPath` as a gzip-compressed SequenceFile of (Text, Text).
+ */
+object ExportActionSetJobNode {
+
+  val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
+
+  def main(args: Array[String]): Unit = {
+    val parametersJson = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString
+    val parser = new ArgumentApplicationParser(parametersJson)
+    parser.parseArgument(args)
+
+    val master = parser.get("master")
+    val sourcePath = parser.get("sourcePath")
+    val targetPath = parser.get("targetPath")
+
+    val conf = new SparkConf
+    val spark: SparkSession = SparkSession.builder()
+      .config(conf)
+      .appName(ExportActionSetJobNode.getClass.getSimpleName)
+      .master(master)
+      .getOrCreate()
+
+    // Oaf is serialized with kryo; the converted records are plain (String, String) pairs
+    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+    implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
+
+    val actions = spark.read.load(sourcePath).as[Oaf]
+      .map(o => DataciteToOAFTransformation.toActionSet(o))
+      .filter(o => o != null)
+
+    actions.rdd
+      .map(pair => (new Text(pair._1), new Text(pair._2)))
+      .saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
+  }
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala
@ -0,0 +1,48 @@
+package eu.dnetlib.dhp.actionmanager.datacite
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
+import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
+import eu.dnetlib.dhp.schema.oaf.Oaf
+import eu.dnetlib.dhp.utils.ISLookupClientFactory
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
+
+import scala.io.Source
+
+/**
+ * Spark job turning the Datacite dump into OAF records.
+ *
+ * It loads the vocabularies through the IS lookup service, reads the DataciteType records stored
+ * at `sourcePath`, keeps the active ones, maps each of them through
+ * DataciteToOAFTransformation.generateOAF and overwrites `targetPath` with the non-null results.
+ */
+object GenerateDataciteDatasetSpark {
+
+  val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
+
+  def main(args: Array[String]): Unit = {
+    val parametersJson = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString
+    val parser = new ArgumentApplicationParser(parametersJson)
+    parser.parseArgument(args)
+
+    val master = parser.get("master")
+    val sourcePath = parser.get("sourcePath")
+    val targetPath = parser.get("targetPath")
+    val isLookupUrl: String = parser.get("isLookupUrl")
+    log.info("isLookupUrl: {}", isLookupUrl)
+
+    // the vocabularies are fetched before the Spark session is created
+    val vocabularies = VocabularyGroup.loadVocsFromIS(ISLookupClientFactory.getLookUpService(isLookupUrl))
+    log.info(s"vocabulary size is ${vocabularies.getTerms("dnet:languages").size()}")
+
+    val conf = new SparkConf
+    val spark: SparkSession = SparkSession.builder()
+      .config(conf)
+      .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
+      .master(master)
+      .getOrCreate()
+
+    implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
+    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+    import spark.implicits._
+
+    val activeRecords = spark.read.load(sourcePath).as[DataciteType].filter(d => d.isActive)
+
+    // NOTE(review): d.timestamp is passed as both the second and the third argument of generateOAF - confirm this is intended
+    activeRecords
+      .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies))
+      .filter(d => d != null)
+      .write.mode(SaveMode.Overwrite).save(targetPath)
+  }
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala
@ -0,0 +1,181 @@
+package eu.dnetlib.dhp.actionmanager.datacite
+
+import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
+import org.apache.hadoop.hdfs.DistributedFileSystem
+import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.expressions.Aggregator
+import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
+import org.json4s.DefaultFormats
+import org.json4s.jackson.JsonMethods.parse
+import org.apache.spark.sql.functions.max
+import org.slf4j.{Logger, LoggerFactory}
+
+import java.time.format.DateTimeFormatter._
+import java.time.{LocalDate, LocalDateTime, ZoneOffset}
+import scala.io.Source
+
+object ImportDatacite {
+
+  val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
+
+
+  /**
+   * Converts one JSON record into a DataciteType.
+   *
+   * The fields `doi`, `isActive` and `updated` are read from the `attributes` object:
+   * the DOI is lower-cased and `updated` (ISO date-time) is stored as seconds since the
+   * epoch, interpreting the parsed local date-time at UTC. The raw input string is kept
+   * untouched in the `json` field.
+   *
+   * @param input the JSON record as a string
+   * @return the corresponding DataciteType
+   */
+  def convertAPIStringToDataciteItem(input: String): DataciteType = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    val attributes: org.json4s.JValue = parse(input) \ "attributes"
+
+    val doi = (attributes \ "doi").extract[String].toLowerCase
+    val isActive = (attributes \ "isActive").extract[Boolean]
+    val updated = LocalDateTime.parse((attributes \ "updated").extract[String], ISO_DATE_TIME)
+    val epochSeconds = updated.toInstant(ZoneOffset.UTC).toEpochMilli / 1000
+
+    DataciteType(doi = doi, timestamp = epochSeconds, isActive = isActive, json = input)
+  }
+
+
+  /**
+   * Incrementally updates the Datacite dump stored at `dataciteDumpPath`.
+   *
+   * Steps:
+   *  1. read the current dump and take the highest `timestamp` it contains;
+   *  2. unless `skipImport` is "true", download the newer records into a SequenceFile at `targetPath`
+   *     (see writeSequenceFile);
+   *  3. convert that SequenceFile into a Dataset saved at `${targetPath}_dataset`;
+   *  4. union it with the dump keeping, for every DOI, the record with the greatest timestamp
+   *     (on equal timestamps the second operand of the reduction wins);
+   *  5. write the result to `${dataciteDump}_updated` and swap it in place of the old dump.
+   *
+   * @throws java.io.IOException if the old dump cannot be deleted or the updated one cannot be moved in place
+   */
+  def main(args: Array[String]): Unit = {
+
+    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
+    parser.parseArgument(args)
+    val master = parser.get("master")
+
+    val hdfsuri = parser.get("namenode")
+    log.info(s"namenode is $hdfsuri")
+
+    val targetPath = parser.get("targetPath")
+    log.info(s"targetPath is $targetPath")
+
+    val dataciteDump = parser.get("dataciteDumpPath")
+    log.info(s"dataciteDump is $dataciteDump")
+
+    val hdfsTargetPath = new Path(targetPath)
+    log.info(s"hdfsTargetPath is $hdfsTargetPath")
+
+
+    val skipImport = parser.get("skipImport")
+    log.info(s"skipImport is $skipImport")
+
+    val spark: SparkSession = SparkSession.builder()
+      .appName(ImportDatacite.getClass.getSimpleName)
+      .master(master)
+      .getOrCreate()
+
+    // ====== Init HDFS File System Object
+    val conf = new Configuration
+    // Set FileSystem URI
+    conf.set("fs.defaultFS", hdfsuri)
+
+    // Because of Maven
+    conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
+    conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
+    val sc: SparkContext = spark.sparkContext
+    sc.setLogLevel("ERROR")
+
+    import spark.implicits._
+
+
+    // Keeps, between two records, the one with the greatest timestamp; null is the neutral element.
+    val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
+
+      override def zero: DataciteType = null
+
+      override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
+        if (b == null)
+          return a
+        if (a == null)
+          return b
+        if (a.timestamp > b.timestamp) {
+          return a
+        }
+        b
+      }
+
+      override def merge(a: DataciteType, b: DataciteType): DataciteType = {
+        reduce(a, b)
+      }
+
+      override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
+
+      override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
+
+      override def finish(reduction: DataciteType): DataciteType = reduction
+    }
+
+    val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
+    val ts = dump.select(max("timestamp")).first().getLong(0)
+
+    println(s"last Timestamp is $ts")
+
+    // when the import is skipped, 1 forces the merge of whatever is already stored at targetPath
+    val cnt = if ("true".equalsIgnoreCase(skipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf)
+
+    println(s"Imported from Datacite API $cnt documents")
+
+    if (cnt > 0) {
+
+      // the file is written by writeSequenceFile with IntWritable keys and Text values
+      val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[IntWritable], classOf[Text])
+        .map(s => s._2.toString)
+        .map(s => convertAPIStringToDataciteItem(s))
+      spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
+
+      val ds: Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType]
+
+      dump
+        .union(ds)
+        .groupByKey(_.doi)
+        .agg(dataciteAggregator.toColumn)
+        .map(s => s._2)
+        .repartition(4000)
+        .write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
+
+      val fs = FileSystem.get(sc.hadoopConfiguration)
+      val dumpPath = new Path(s"$dataciteDump")
+      val updatedPath = new Path(s"${dataciteDump}_updated")
+
+      // delete and rename signal failure through their boolean result: fail loudly instead of
+      // leaving the dump missing or the updated data nested under a still existing directory
+      if (fs.exists(dumpPath) && !fs.delete(dumpPath, true))
+        throw new java.io.IOException(s"Unable to delete the previous dump $dumpPath")
+      if (!fs.rename(updatedPath, dumpPath))
+        throw new java.io.IOException(s"Unable to move $updatedPath to $dumpPath")
+    }
+  }
+
+  /**
+   * Downloads records through DataciteAPIImporter and appends them to a SequenceFile
+   * (IntWritable progressive key, Text value) created at `hdfsTargetPath`.
+   *
+   * The interval between `timestamp` and the current time is walked in windows of `delta`
+   * milliseconds; one importer is created per window.
+   *
+   * A non-fatal error raised while importing is logged and stops the download: the records
+   * appended so far are kept and their number is returned (best-effort behaviour). Errors raised
+   * while creating the writer are propagated to the caller.
+   *
+   * @param hdfsTargetPath location of the SequenceFile to create
+   * @param timestamp      starting point in seconds (multiplied by 1000 before use)
+   * @param conf           Hadoop configuration used to create the writer
+   * @return the number of records appended
+   */
+  private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration): Long = {
+    val delta: Long = 50000000L
+    val now: Long = System.currentTimeMillis()
+    var from: Long = timestamp * 1000
+    var i = 0
+    val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
+    try {
+      // key and value are reused for every append
+      val key: IntWritable = new IntWritable(i)
+      val value: Text = new Text
+      var start: Long = System.currentTimeMillis
+      while (from < now) {
+        val client = new DataciteAPIImporter(from, 1000, from + delta)
+        while (client.hasNext) {
+          key.set(i)
+          i += 1
+          value.set(client.next())
+          writer.append(key, value)
+          writer.hflush()
+          if (i % 1000 == 0) {
+            val time = (System.currentTimeMillis - start) / 1000.0F
+            println(s"Imported $i in $time seconds")
+            start = System.currentTimeMillis
+          }
+        }
+        println(s"updating from value: $from  -> ${from + delta}")
+        from = from + delta
+      }
+    } catch {
+      // NonFatal lets OutOfMemoryError, InterruptedException and the like propagate
+      case scala.util.control.NonFatal(e) =>
+        log.error(s"Import from the Datacite API stopped after $i records (from=$from)", e)
+    } finally writer.close()
+    i
+  }
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java
@ -1,20 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.project.httpconnector;
-
-import java.util.LinkedList;
-
-public class CollectorPluginErrorLogList extends LinkedList<String> {
-
-	private static final long serialVersionUID = -6925786561303289704L;
-
-	@Override
-	public String toString() {
-		String log = new String();
-		int index = 0;
-		for (String errorMessage : this) {
-			log += String.format("Retry #%s: %s / ", index++, errorMessage);
-		}
-		return log;
-	}
-
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java
@ -1,20 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.project.httpconnector;
-
-public class CollectorServiceException extends Exception {
-
-	private static final long serialVersionUID = 7523999812098059764L;
-
-	public CollectorServiceException(String string) {
-		super(string);
-	}
-
-	public CollectorServiceException(String string, Throwable exception) {
-		super(string, exception);
-	}
-
-	public CollectorServiceException(Throwable exception) {
-		super(exception);
-	}
-
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java
@ -1,240 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.project.httpconnector;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.*;
-import java.security.GeneralSecurityException;
-import java.security.cert.X509Certificate;
-import java.util.List;
-import java.util.Map;
-
-import javax.net.ssl.HttpsURLConnection;
-import javax.net.ssl.SSLContext;
-import javax.net.ssl.TrustManager;
-import javax.net.ssl.X509TrustManager;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.math.NumberUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-/**
- * @author jochen, michele, andrea
- */
-public class HttpConnector {
-
-	private static final Log log = LogFactory.getLog(HttpConnector.class);
-
-	private int maxNumberOfRetry = 6;
-	private int defaultDelay = 120; // seconds
-	private int readTimeOut = 120; // seconds
-
-	private String responseType = null;
-
-	private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
-
-	public HttpConnector() {
-		CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
-	}
-
-	/**
-	 * Given the URL returns the content via HTTP GET
-	 *
-	 * @param requestUrl the URL
-	 * @return the content of the downloaded resource
-	 * @throws CollectorServiceException when retrying more than maxNumberOfRetry times
-	 */
-	public String getInputSource(final String requestUrl) throws CollectorServiceException {
-		return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
-	}
-
-	/**
-	 * Given the URL returns the content as a stream via HTTP GET
-	 *
-	 * @param requestUrl the URL
-	 * @return the content of the downloaded resource as InputStream
-	 * @throws CollectorServiceException when retrying more than maxNumberOfRetry times
-	 */
-	public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
-		return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
-	}
-
-	private String attemptDownlaodAsString(final String requestUrl, final int retryNumber,
-		final CollectorPluginErrorLogList errorList)
-		throws CollectorServiceException {
-		try {
-			InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
-			try {
-				return IOUtils.toString(s);
-			} catch (IOException e) {
-				log.error("error while retrieving from http-connection occured: " + requestUrl, e);
-				Thread.sleep(defaultDelay * 1000);
-				errorList.add(e.getMessage());
-				return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
-			} finally {
-				IOUtils.closeQuietly(s);
-			}
-		} catch (InterruptedException e) {
-			throw new CollectorServiceException(e);
-		}
-	}
-
-	private InputStream attemptDownload(final String requestUrl, final int retryNumber,
-		final CollectorPluginErrorLogList errorList)
-		throws CollectorServiceException {
-
-		if (retryNumber > maxNumberOfRetry) {
-			throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList);
-		}
-
-		log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
-		try {
-			InputStream input = null;
-
-			try {
-				final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
-				urlConn.setInstanceFollowRedirects(false);
-				urlConn.setReadTimeout(readTimeOut * 1000);
-				urlConn.addRequestProperty("User-Agent", userAgent);
-
-				if (log.isDebugEnabled()) {
-					logHeaderFields(urlConn);
-				}
-
-				int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
-				if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
-					log.warn("waiting and repeating request after " + retryAfter + " sec.");
-					Thread.sleep(retryAfter * 1000);
-					errorList.add("503 Service Unavailable");
-					urlConn.disconnect();
-					return attemptDownload(requestUrl, retryNumber + 1, errorList);
-				} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM)
-					|| (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) {
-					final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
-					log.debug("The requested url has been moved to " + newUrl);
-					errorList
-						.add(
-							String
-								.format(
-									"%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(),
-									newUrl));
-					urlConn.disconnect();
-					return attemptDownload(newUrl, retryNumber + 1, errorList);
-				} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
-					log
-						.error(
-							String
-								.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
-					Thread.sleep(defaultDelay * 1000);
-					errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
-					urlConn.disconnect();
-					return attemptDownload(requestUrl, retryNumber + 1, errorList);
-				} else {
-					input = urlConn.getInputStream();
-					responseType = urlConn.getContentType();
-					return input;
-				}
-			} catch (IOException e) {
-				log.error("error while retrieving from http-connection occured: " + requestUrl, e);
-				Thread.sleep(defaultDelay * 1000);
-				errorList.add(e.getMessage());
-				return attemptDownload(requestUrl, retryNumber + 1, errorList);
-			}
-		} catch (InterruptedException e) {
-			throw new CollectorServiceException(e);
-		}
-	}
-
-	private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
-		log.debug("StatusCode: " + urlConn.getResponseMessage());
-
-		for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
-			if (e.getKey() != null) {
-				for (String v : e.getValue()) {
-					log.debug("  key: " + e.getKey() + " - value: " + v);
-				}
-			}
-		}
-	}
-
-	private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
-		for (String key : headerMap.keySet()) {
-			if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0)
-				&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
-				return Integer
-					.parseInt(headerMap.get(key).get(0)) + 10;
-			}
-		}
-		return -1;
-	}
-
-	private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
-		for (String key : headerMap.keySet()) {
-			if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) {
-				return headerMap.get(key).get(0);
-			}
-		}
-		throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
-	}
-
-	/**
-	 * register for https scheme; this is a workaround and not intended for the use in trusted environments
-	 */
-	public void initTrustManager() {
-		final X509TrustManager tm = new X509TrustManager() {
-
-			@Override
-			public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
-			}
-
-			@Override
-			public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
-			}
-
-			@Override
-			public X509Certificate[] getAcceptedIssuers() {
-				return null;
-			}
-		};
-		try {
-			final SSLContext ctx = SSLContext.getInstance("TLS");
-			ctx.init(null, new TrustManager[] {
-				tm
-			}, null);
-			HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
-		} catch (GeneralSecurityException e) {
-			log.fatal(e);
-			throw new IllegalStateException(e);
-		}
-	}
-
-	public int getMaxNumberOfRetry() {
-		return maxNumberOfRetry;
-	}
-
-	public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
-		this.maxNumberOfRetry = maxNumberOfRetry;
-	}
-
-	public int getDefaultDelay() {
-		return defaultDelay;
-	}
-
-	public void setDefaultDelay(final int defaultDelay) {
-		this.defaultDelay = defaultDelay;
-	}
-
-	public int getReadTimeOut() {
-		return readTimeOut;
-	}
-
-	public void setReadTimeOut(final int readTimeOut) {
-		this.readTimeOut = readTimeOut;
-	}
-
-	public String getResponseType() {
-		return responseType;
-	}
-
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java
@ -17,8 +17,8 @@ import org.apache.hadoop.fs.Path;

 import com.fasterxml.jackson.databind.ObjectMapper;

-import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.collection.HttpConnector2;

 /**
 * Applies the parsing of a csv file and writes the Serialization of it in hdfs
@ -74,7 +74,7 @@ public class ReadCSV implements Closeable {
 		throws Exception {
 		this.conf = new Configuration();
 		this.conf.set("fs.defaultFS", hdfsNameNode);
-		HttpConnector httpConnector = new HttpConnector();
+		HttpConnector2 httpConnector = new HttpConnector2();
 		FileSystem fileSystem = FileSystem.get(this.conf);
 		Path hdfsWritePath = new Path(hdfsPath);
 		FSDataOutputStream fsDataOutputStream = null;
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java
@ -14,13 +14,12 @@ import org.apache.hadoop.fs.Path;

 import com.fasterxml.jackson.databind.ObjectMapper;

-import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.collection.HttpConnector2;

 /**
 * Applies the parsing of an excel file and writes the Serialization of it in hdfs
 */
-
 public class ReadExcel implements Closeable {
 	private static final Log log = LogFactory.getLog(ReadCSV.class);
 	private final Configuration conf;
@ -72,7 +71,7 @@ public class ReadExcel implements Closeable {
 		throws Exception {
 		this.conf = new Configuration();
 		this.conf.set("fs.defaultFS", hdfsNameNode);
-		HttpConnector httpConnector = new HttpConnector();
+		HttpConnector2 httpConnector = new HttpConnector2();
 		FileSystem fileSystem = FileSystem.get(this.conf);
 		Path hdfsWritePath = new Path(hdfsPath);
 		FSDataOutputStream fsDataOutputStream = null;
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java
@ -0,0 +1,45 @@
+
+package eu.dnetlib.dhp.aggregation.common;
+
+import java.io.Serializable;
+
+import org.apache.spark.util.LongAccumulator;
+
+/**
+ * Serializable holder of the three Spark accumulators used to count the total,
+ * the failed and the processed items.
+ */
+public class AggregationCounter implements Serializable {
+
+	private LongAccumulator totalItems;
+	private LongAccumulator errorItems;
+	private LongAccumulator processedItems;
+
+	/** Creates a holder whose accumulators are all unset (null). */
+	public AggregationCounter() {
+	}
+
+	/**
+	 * @param totalItems     accumulator for the total number of items
+	 * @param errorItems     accumulator for the items in error
+	 * @param processedItems accumulator for the processed items
+	 */
+	public AggregationCounter(final LongAccumulator totalItems, final LongAccumulator errorItems,
+		final LongAccumulator processedItems) {
+		this.totalItems = totalItems;
+		this.errorItems = errorItems;
+		this.processedItems = processedItems;
+	}
+
+	/** @return the accumulator for the total number of items */
+	public LongAccumulator getTotalItems() {
+		return this.totalItems;
+	}
+
+	public void setTotalItems(final LongAccumulator totalItems) {
+		this.totalItems = totalItems;
+	}
+
+	/** @return the accumulator for the items in error */
+	public LongAccumulator getErrorItems() {
+		return this.errorItems;
+	}
+
+	public void setErrorItems(final LongAccumulator errorItems) {
+		this.errorItems = errorItems;
+	}
+
+	/** @return the accumulator for the processed items */
+	public LongAccumulator getProcessedItems() {
+		return this.processedItems;
+	}
+
+	public void setProcessedItems(final LongAccumulator processedItems) {
+		this.processedItems = processedItems;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java
@ -0,0 +1,47 @@
+
+package eu.dnetlib.dhp.aggregation.common;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Objects;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+
+import eu.dnetlib.dhp.message.MessageSender;
+import eu.dnetlib.dhp.utils.DHPUtils;
+
+/**
+ * Ordered map of report entries that can be delivered through a MessageSender.
+ *
+ * When no MessageSender is configured (no-arg constructor) both {@link #ongoing(Long, Long)}
+ * and {@link #close()} are no-ops.
+ */
+public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
+
+	private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class);
+
+	private MessageSender messageSender;
+
+	/** Creates a report without a MessageSender: nothing is ever sent. */
+	public AggregatorReport() {
+	}
+
+	/**
+	 * @param messageSender the sender used for ongoing messages and for the final report
+	 * @throws IOException declared for callers' compatibility; not thrown by this constructor
+	 */
+	public AggregatorReport(MessageSender messageSender) throws IOException {
+		this.messageSender = messageSender;
+	}
+
+	/**
+	 * Sends an ongoing-progress message.
+	 *
+	 * @param current the current progress value
+	 * @param total the total value
+	 */
+	public void ongoing(Long current, Long total) {
+		// same guard as close(): the no-arg constructor leaves the sender unset
+		if (Objects.nonNull(messageSender)) {
+			messageSender.sendMessage(current, total);
+		}
+	}
+
+	/**
+	 * Logs every entry and sends the report: a single-entry map whose key is the lower-cased
+	 * simple class name and whose value is the JSON serialization of the map values.
+	 *
+	 * @throws IOException if the values cannot be serialized
+	 */
+	@Override
+	public void close() throws IOException {
+		if (Objects.nonNull(messageSender)) {
+			log.info("closing report: ");
+			this.forEach((k, v) -> log.info("{} - {}", k, v));
+
+			Map<String, String> m = new HashMap<>();
+			m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values()));
+			messageSender.sendReport(m);
+		}
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReporterCallback.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReporterCallback.java
@ -0,0 +1,10 @@
+
+package eu.dnetlib.dhp.aggregation.common;
+
+/**
+ * Callback queried by ReportingJob to obtain the two values sent with each ongoing message.
+ */
+public interface ReporterCallback {
+
+	/** @return the value passed as the "current" argument of the ongoing message */
+	Long getCurrent();
+
+	/** @return the value passed as the "total" argument of the ongoing message */
+	Long getTotal();
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReportingJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReportingJob.java
@ -0,0 +1,41 @@
+
+package eu.dnetlib.dhp.aggregation.common;
+
+import java.util.TimerTask;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Base class for jobs that periodically push their progress to an AggregatorReport
+ * from a dedicated single-thread scheduler.
+ */
+public abstract class ReportingJob {
+
+	/**
+	 * Frequency (seconds) for sending ongoing messages to report the collection task advancement
+	 */
+	public static final int ONGOING_REPORT_FREQUENCY = 5;
+
+	/**
+	 * Initial delay (seconds) for sending ongoing messages to report the collection task advancement
+	 */
+	public static final int INITIAL_DELAY = 2;
+
+	private final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
+
+	protected final AggregatorReport report;
+
+	public ReportingJob(AggregatorReport report) {
+		this.report = report;
+	}
+
+	/**
+	 * Starts the periodic reporting: after INITIAL_DELAY seconds, and then every
+	 * ONGOING_REPORT_FREQUENCY seconds, the callback is queried and its values are
+	 * forwarded to the report.
+	 *
+	 * @param callback the supplier of the current and total values
+	 */
+	protected void schedule(final ReporterCallback callback) {
+		final Runnable sendOngoing = () -> report.ongoing(callback.getCurrent(), callback.getTotal());
+		executor.scheduleAtFixedRate(sendOngoing, INITIAL_DELAY, ONGOING_REPORT_FREQUENCY, TimeUnit.SECONDS);
+	}
+
+	/** Stops the scheduler; no further ongoing messages are scheduled. */
+	protected void shutdown() {
+		executor.shutdown();
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java
@ -0,0 +1,136 @@
+
+package eu.dnetlib.dhp.aggregation.mdstore;
+
+import static eu.dnetlib.dhp.common.Constants.*;
+import static eu.dnetlib.dhp.utils.DHPUtils.*;
+
+import java.net.URI;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.rest.DNetRestClient;
+
+/**
+ * Command-line node performing one action against the MDStore manager.
+ *
+ * The action is selected by the "action" argument (one of the MDAction values) and is carried
+ * out with a GET request, issued through DNetRestClient, on a URL built from "mdStoreManagerURI".
+ */
+public class MDStoreActionNode {
+	private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class);
+
+	/** The actions accepted as value of the "action" argument. */
+	enum MDAction {
+		NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK
+	}
+
+	// URL templates: the first placeholder is always the MDStore manager base URI.
+	// NOTE(review): unlike the other templates this one is not final - confirm whether that is intended
+	public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion";
+
+	public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s";
+	public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort";
+
+	public static final String READ_LOCK_URL = "%s/mdstore/%s/startReading";
+	public static final String READ_UNLOCK_URL = "%s/version/%s/endReading";
+
+	// names under which the JSON-serialized MDStoreVersion is passed to populateOOZIEEnv
+	private static final String MDSTOREVERSIONPARAM = "mdStoreVersion";
+	private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion";
+
+	/**
+	 * Parses the arguments described by mdstore_action_parameters.json and executes the requested action.
+	 *
+	 * @param args the command-line arguments
+	 * @throws IllegalArgumentException when a required argument is blank or the MDStoreVersion has a blank id
+	 * @throws Exception any other error raised while parsing the arguments, accessing HDFS or calling the manager
+	 */
+	public static void main(String[] args) throws Exception {
+		final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					MDStoreActionNode.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/collection/mdstore_action_parameters.json")));
+		argumentParser.parseArgument(args);
+
+		log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));
+
+		// valueOf fails with an exception when the argument is not the name of an MDAction
+		final MDAction action = MDAction.valueOf(argumentParser.get("action"));
+		log.info("Current action is {}", action);
+
+		final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI");
+		log.info("mdStoreManagerURI is {}", mdStoreManagerURI);
+
+		switch (action) {
+			// asks the manager for a new version of the given mdstore and passes it on as "mdStoreVersion"
+			case NEW_VERSION: {
+				final String mdStoreID = argumentParser.get("mdStoreID");
+				if (StringUtils.isBlank(mdStoreID)) {
+					throw new IllegalArgumentException("missing or empty argument mdStoreId");
+				}
+				final MDStoreVersion currentVersion = DNetRestClient
+					.doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
+				populateOOZIEEnv(MDSTOREVERSIONPARAM, MAPPER.writeValueAsString(currentVersion));
+				break;
+			}
+			// reads the store size from HDFS and commits the version with that size
+			case COMMIT: {
+
+				final String hdfsuri = argumentParser.get("namenode");
+				if (StringUtils.isBlank(hdfsuri)) {
+					throw new IllegalArgumentException("missing or empty argument namenode");
+				}
+				final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
+				final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
+
+				if (StringUtils.isBlank(mdStoreVersion.getId())) {
+					throw new IllegalArgumentException(
+						"invalid MDStoreVersion value current is " + mdStoreVersion_params);
+				}
+				Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
+
+				try (
+					FileSystem fs = FileSystem.get(URI.create(hdfsuri), getHadoopConfiguration(hdfsuri));
+					FSDataInputStream inputStream = fs.open(hdfstoreSizepath)) {
+
+					// the whole file content must be parseable as a long
+					final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream));
+
+					// NOTE(review): the size file is re-created right after being read and the returned
+					// stream is never closed explicitly; presumably this empties the file - confirm the intent
+					fs.create(hdfstoreSizepath);
+					DNetRestClient
+						.doGET(
+							String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize));
+				}
+
+				break;
+			}
+			// aborts the version received in the "mdStoreVersion" argument
+			case ROLLBACK: {
+				final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
+				final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
+
+				if (StringUtils.isBlank(mdStoreVersion.getId())) {
+					throw new IllegalArgumentException(
+						"invalid MDStoreVersion value current is " + mdStoreVersion_params);
+				}
+				DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId()));
+				break;
+			}
+
+			// starts reading the given mdstore and passes the returned version on as "mdStoreReadLockVersion"
+			case READ_LOCK: {
+				final String mdStoreID = argumentParser.get("mdStoreID");
+				if (StringUtils.isBlank(mdStoreID)) {
+					throw new IllegalArgumentException("missing or empty argument mdStoreId");
+				}
+				final MDStoreVersion currentVersion = DNetRestClient
+					.doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
+				populateOOZIEEnv(MDSTOREREADLOCKPARAM, MAPPER.writeValueAsString(currentVersion));
+				break;
+			}
+			// ends the reading of the version received, as JSON, in the "readMDStoreId" argument
+			case READ_UNLOCK: {
+				final String mdStoreVersion_params = argumentParser.get("readMDStoreId");
+				final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
+
+				if (StringUtils.isBlank(mdStoreVersion.getId())) {
+					throw new IllegalArgumentException(
+						"invalid MDStoreVersion value current is " + mdStoreVersion_params);
+				}
+				DNetRestClient.doGET(String.format(READ_UNLOCK_URL, mdStoreManagerURI, mdStoreVersion.getId()));
+				break;
+			}
+
+			default:
+				throw new IllegalArgumentException("invalid action");
+		}
+
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java
@ -1,16 +1,16 @@

-package eu.dnetlib.dhp.collection.worker;
+package eu.dnetlib.dhp.collection;

-public class DnetCollectorException extends Exception {
+public class CollectorException extends Exception {

 	/** */
 	private static final long serialVersionUID = -290723075076039757L;

-	public DnetCollectorException() {
+	public CollectorException() {
 		super();
 	}

-	public DnetCollectorException(
+	public CollectorException(
 		final String message,
 		final Throwable cause,
 		final boolean enableSuppression,
@ -18,15 +18,15 @@ public class DnetCollectorException extends Exception {
 		super(message, cause, enableSuppression, writableStackTrace);
 	}

-	public DnetCollectorException(final String message, final Throwable cause) {
+	public CollectorException(final String message, final Throwable cause) {
 		super(message, cause);
 	}

-	public DnetCollectorException(final String message) {
+	public CollectorException(final String message) {
 		super(message);
 	}

-	public DnetCollectorException(final Throwable cause) {
+	public CollectorException(final Throwable cause) {
 		super(cause);
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java
@ -0,0 +1,134 @@
+
+package eu.dnetlib.dhp.collection;
+
+import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME;
+
+import java.io.IOException;
+import java.util.Optional;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.DeflateCodec;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
+import eu.dnetlib.dhp.aggregation.common.ReportingJob;
+import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
+import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
+import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
+import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
+
+/**
+ * Runs one metadata collection: selects the collector plugin matching the API protocol, streams the records the
+ * plugin yields and appends them, keyed by a progressive counter, to a block-compressed sequence file stored under
+ * the HDFS path of the given MDStore version.
+ */
+public class CollectorWorker extends ReportingJob {
+
+	private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class);
+
+	private final ApiDescriptor api;
+
+	private final FileSystem fileSystem;
+
+	private final MDStoreVersion mdStoreVersion;
+
+	private final HttpClientParams clientParams;
+
+	/**
+	 * @param api descriptor of the API to collect from (protocol and parameters)
+	 * @param fileSystem file system providing the configuration used to create the sequence file
+	 * @param mdStoreVersion MDStore version whose HDFS path receives the collected records
+	 * @param clientParams HTTP settings handed over to the HTTP based plugins
+	 * @param report report passed to the parent {@link ReportingJob} and to the plugin
+	 */
+	public CollectorWorker(
+		final ApiDescriptor api,
+		final FileSystem fileSystem,
+		final MDStoreVersion mdStoreVersion,
+		final HttpClientParams clientParams,
+		final AggregatorReport report) {
+		super(report);
+		this.api = api;
+		this.fileSystem = fileSystem;
+		this.mdStoreVersion = mdStoreVersion;
+		this.clientParams = clientParams;
+	}
+
+	/**
+	 * Collects all the records and writes them to the sequence file.
+	 *
+	 * @throws UnknownCollectorPluginException when no plugin matches the API protocol / plugin type
+	 * @throws CollectorException wrapping any error raised while collecting or writing
+	 * @throws IOException declared for callers; failures inside the try block are wrapped in CollectorException
+	 */
+	public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
+
+		final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
+		log.info("outputPath path is {}", outputPath);
+
+		final CollectorPlugin plugin = getCollectorPlugin();
+		final AtomicInteger counter = new AtomicInteger(0);
+
+		scheduleReport(counter);
+
+		try (SequenceFile.Writer writer = SequenceFile
+			.createWriter(
+				fileSystem.getConf(),
+				SequenceFile.Writer.file(new Path(outputPath)),
+				SequenceFile.Writer.keyClass(IntWritable.class),
+				SequenceFile.Writer.valueClass(Text.class),
+				SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
+			// key and value holders are reused across records
+			final IntWritable key = new IntWritable(counter.get());
+			final Text value = new Text();
+			plugin
+				.collect(api, report)
+				.forEach(
+					content -> {
+						key.set(counter.getAndIncrement());
+						value.set(content);
+						try {
+							writer.append(key, value);
+						} catch (Throwable e) {
+							// the lambda cannot throw checked exceptions: rethrown unchecked and handled below
+							throw new RuntimeException(e);
+						}
+					});
+		} catch (Throwable e) {
+			report.put(e.getClass().getName(), e.getMessage());
+			throw new CollectorException(e);
+		} finally {
+			// always stop the scheduled reporting and publish the final count, also on failure
+			shutdown();
+			report.ongoing(counter.longValue(), counter.longValue());
+		}
+	}
+
+	/**
+	 * Schedules a callback exposing the number of records collected so far; the total is not known in advance.
+	 */
+	private void scheduleReport(AtomicInteger counter) {
+		schedule(new ReporterCallback() {
+			@Override
+			public Long getCurrent() {
+				return counter.longValue();
+			}
+
+			@Override
+			public Long getTotal() {
+				return null;
+			}
+		});
+	}
+
+	/**
+	 * Instantiates the plugin matching the API protocol. For the "other" protocol the concrete plugin is chosen
+	 * by the "other_plugin_type" API parameter.
+	 *
+	 * @throws UnknownCollectorPluginException when the protocol or the plugin type is missing or not managed
+	 */
+	private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
+
+		final String protocolName = api.getProtocol();
+		if (protocolName == null) {
+			throw new UnknownCollectorPluginException("protocol is not managed: " + protocolName);
+		}
+
+		final CollectorPlugin.NAME protocol;
+		try {
+			protocol = CollectorPlugin.NAME.valueOf(protocolName);
+		} catch (IllegalArgumentException e) {
+			// valueOf fails on names that are not enum constants: report it with the declared exception
+			throw new UnknownCollectorPluginException("protocol is not managed: " + protocolName);
+		}
+
+		switch (protocol) {
+			case oai:
+				return new OaiCollectorPlugin(clientParams);
+			case rest_json2xml:
+				return new RestCollectorPlugin(clientParams);
+			case other:
+				final CollectorPlugin.NAME.OTHER_NAME plugin;
+				try {
+					plugin = Optional
+						.ofNullable(api.getParams().get("other_plugin_type"))
+						.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
+						.orElseThrow(
+							() -> new UnknownCollectorPluginException(
+								"missing mandatory parameter: other_plugin_type"));
+				} catch (IllegalArgumentException e) {
+					throw new UnknownCollectorPluginException(
+						"plugin is not managed: " + api.getParams().get("other_plugin_type"));
+				}
+
+				switch (plugin) {
+					case mdstore_mongodb_dump:
+						return new MongoDbDumpCollectorPlugin(fileSystem);
+					case mdstore_mongodb:
+						return new MDStoreCollectorPlugin();
+					default:
+						throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
+				}
+			default:
+				throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
+		}
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java
@ -0,0 +1,135 @@
+
+package eu.dnetlib.dhp.collection;
+
+import static eu.dnetlib.dhp.common.Constants.*;
+import static eu.dnetlib.dhp.utils.DHPUtils.*;
+
+import java.io.IOException;
+import java.util.Optional;
+
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.fs.FileSystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.message.MessageSender;
+
+/**
+ * Command line entry point of the metadata collection: it parses the job arguments, builds the HDFS file system
+ * handle and the HTTP client settings, then delegates the actual collection to {@link CollectorWorker}, which
+ * stores the outcome on HDFS. It is meant to be launched by the metadata collection oozie workflow, which supplies
+ * the parameters needed to instantiate and configure the specific collection plugin.
+ *
+ * @author Sandro La Bruzzo, Claudio Atzori
+ */
+public class CollectorWorkerApplication {
+
+	private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
+
+	private FileSystem fileSystem;
+
+	public CollectorWorkerApplication(FileSystem fileSystem) {
+		this.fileSystem = fileSystem;
+	}
+
+	/**
+	 * Parses the arguments described by collector_worker_input_parameter.json and starts the collection.
+	 *
+	 * @param args the command line arguments
+	 */
+	public static void main(final String[] args)
+		throws ParseException, IOException, UnknownCollectorPluginException, CollectorException {
+
+		final String parametersSpec = IOUtils
+			.toString(
+				CollectorWorkerApplication.class
+					.getResourceAsStream("/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json"));
+		final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(parametersSpec);
+		argumentParser.parseArgument(args);
+
+		final long maxHeapMb = Runtime.getRuntime().maxMemory() / (1024 * 1024);
+		log.info("Java Xmx: {}m", maxHeapMb);
+
+		final String nameNode = argumentParser.get("namenode");
+		log.info("hdfsURI is {}", nameNode);
+
+		final String apiJson = argumentParser.get("apidescriptor");
+		log.info("apiDescriptor is {}", apiJson);
+
+		final String versionJson = argumentParser.get("mdStoreVersion");
+		log.info("mdStoreVersion is {}", versionJson);
+
+		final String messageManagerUrl = argumentParser.get(DNET_MESSAGE_MGR_URL);
+		log.info("dnetMessageManagerURL is {}", messageManagerUrl);
+
+		final String wfId = argumentParser.get("workflowId");
+		log.info("workflowId is {}", wfId);
+
+		final HttpClientParams httpParams = getClientParams(argumentParser);
+
+		final ApiDescriptor apiDescriptor = MAPPER.readValue(apiJson, ApiDescriptor.class);
+		final FileSystem hdfs = FileSystem.get(getHadoopConfiguration(nameNode));
+
+		final CollectorWorkerApplication application = new CollectorWorkerApplication(hdfs);
+		application.run(versionJson, httpParams, apiDescriptor, messageManagerUrl, wfId);
+	}
+
+	/**
+	 * Deserializes the MDStore version and runs a {@link CollectorWorker}; the report is closed when the
+	 * collection terminates, either normally or with an exception.
+	 */
+	protected void run(String mdStoreVersion, HttpClientParams clientParams, ApiDescriptor api,
+		String dnetMessageManagerURL, String workflowId)
+		throws IOException, CollectorException, UnknownCollectorPluginException {
+
+		final MDStoreVersion version = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
+		final MessageSender sender = new MessageSender(dnetMessageManagerURL, workflowId);
+
+		try (AggregatorReport report = new AggregatorReport(sender)) {
+			final CollectorWorker worker = new CollectorWorker(api, fileSystem, version, clientParams, report);
+			worker.collect();
+		}
+	}
+
+	/**
+	 * Builds the HTTP client settings from the optional arguments, falling back on the defaults declared by
+	 * {@link HttpClientParams} for the ones that are not provided. Each resolved value is logged.
+	 */
+	private static HttpClientParams getClientParams(ArgumentApplicationParser argumentParser) {
+		final HttpClientParams params = new HttpClientParams();
+
+		params.setMaxNumberOfRetry(intArgument(argumentParser, MAX_NUMBER_OF_RETRY, HttpClientParams._maxNumberOfRetry));
+		log.info("maxNumberOfRetry is {}", params.getMaxNumberOfRetry());
+
+		params.setRequestDelay(intArgument(argumentParser, REQUEST_DELAY, HttpClientParams._requestDelay));
+		log.info("requestDelay is {}", params.getRequestDelay());
+
+		params.setRetryDelay(intArgument(argumentParser, RETRY_DELAY, HttpClientParams._retryDelay));
+		log.info("retryDelay is {}", params.getRetryDelay());
+
+		params.setConnectTimeOut(intArgument(argumentParser, CONNECT_TIMEOUT, HttpClientParams._connectTimeOut));
+		log.info("connectTimeOut is {}", params.getConnectTimeOut());
+
+		params.setReadTimeOut(intArgument(argumentParser, READ_TIMEOUT, HttpClientParams._readTimeOut));
+		log.info("readTimeOut is {}", params.getReadTimeOut());
+
+		return params;
+	}
+
+	/**
+	 * Reads an integer argument, returning the given default when the argument is not set.
+	 */
+	private static int intArgument(ArgumentApplicationParser argumentParser, String name, int defaultValue) {
+		final String raw = argumentParser.get(name);
+		if (raw == null) {
+			return defaultValue;
+		}
+		return Integer.parseInt(raw);
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java
@ -1,28 +1,26 @@

 package eu.dnetlib.dhp.collection;

+import static eu.dnetlib.dhp.common.Constants.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static eu.dnetlib.dhp.utils.DHPUtils.*;

 import java.io.ByteArrayInputStream;
+import java.io.IOException;
 import java.nio.charset.StandardCharsets;
-import java.util.HashMap;
-import java.util.Map;
 import java.util.Objects;
 import java.util.Optional;

-import org.apache.commons.cli.*;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoder;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SparkSession;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.expressions.Aggregator;
 import org.apache.spark.util.LongAccumulator;
 import org.dom4j.Document;
 import org.dom4j.Node;
@ -30,19 +28,172 @@ import org.dom4j.io.SAXReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import com.fasterxml.jackson.databind.ObjectMapper;
-
+import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.model.mdstore.Provenance;
-import eu.dnetlib.message.Message;
-import eu.dnetlib.message.MessageManager;
-import eu.dnetlib.message.MessageType;
+import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.schema.mdstore.Provenance;
+import scala.Tuple2;

 public class GenerateNativeStoreSparkJob {

 	private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class);

+	/**
+	 * Parses the arguments described by generate_native_input_parameters.json and runs the native MDStore
+	 * generation inside a Spark session.
+	 *
+	 * @param args the command line arguments
+	 */
+	public static void main(String[] args) throws Exception {
+
+		final String parametersSpec = IOUtils
+			.toString(
+				GenerateNativeStoreSparkJob.class
+					.getResourceAsStream("/eu/dnetlib/dhp/collection/generate_native_input_parameters.json"));
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(parametersSpec);
+		parser.parseArgument(args);
+
+		final String provenanceJson = parser.get("provenance");
+		log.info("Provenance is {}", provenanceJson);
+		final Provenance provenance = MAPPER.readValue(provenanceJson, Provenance.class);
+
+		final String collectionDate = parser.get("dateOfCollection");
+		log.info("dateOfCollection is {}", collectionDate);
+		final Long dateOfCollection = Long.valueOf(collectionDate);
+
+		final String currentVersionJson = parser.get("mdStoreVersion");
+		log.info("mdStoreVersion is {}", currentVersionJson);
+		final MDStoreVersion currentVersion = MAPPER.readValue(currentVersionJson, MDStoreVersion.class);
+
+		final String readVersionJson = parser.get("readMdStoreVersion");
+		log.info("readMdStoreVersion is {}", readVersionJson);
+
+		// the read version is optional: when it is blank no previous version is passed on
+		final MDStoreVersion readMdStoreVersion;
+		if (StringUtils.isBlank(readVersionJson)) {
+			readMdStoreVersion = null;
+		} else {
+			readMdStoreVersion = MAPPER.readValue(readVersionJson, MDStoreVersion.class);
+		}
+
+		final String xpath = parser.get("xpath");
+		log.info("xpath is {}", xpath);
+
+		final String encoding = parser.get("encoding");
+		log.info("encoding is {}", encoding);
+
+		final String managedFlag = parser.get("isSparkSessionManaged");
+		final Boolean isSparkSessionManaged = managedFlag == null ? Boolean.TRUE : Boolean.valueOf(managedFlag);
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		final SparkConf sparkConf = new SparkConf();
+		runWithSparkSession(
+			sparkConf,
+			isSparkSessionManaged,
+			spark -> createNativeMDStore(
+				spark, provenance, dateOfCollection, xpath, encoding, currentVersion, readMdStoreVersion));
+	}
+
+	/**
+	 * Parses the collected sequence file into {@link MetadataRecord}s and saves them as the MDStore data of the
+	 * current version; finally counts the saved records and writes the count to the MDStore size file.
+	 *
+	 * @param spark the Spark session
+	 * @param provenance provenance attached to every parsed record
+	 * @param dateOfCollection collection date attached to every parsed record
+	 * @param xpath xpath passed to the record parser
+	 * @param encoding encoding passed to the record parser
+	 * @param currentVersion MDStore version to read the sequence file from and to write the result to
+	 * @param readVersion previous MDStore version to merge with, or null to store only the new records
+	 * @throws IOException declared for callers (presumably raised by the HDFS write of the size file)
+	 */
+	private static void createNativeMDStore(SparkSession spark,
+		Provenance provenance,
+		Long dateOfCollection,
+		String xpath,
+		String encoding,
+		MDStoreVersion currentVersion,
+		MDStoreVersion readVersion) throws IOException {
+		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+		final LongAccumulator totalItems = sc.sc().longAccumulator(CONTENT_TOTALITEMS);
+		final LongAccumulator invalidRecords = sc.sc().longAccumulator(CONTENT_INVALIDRECORDS);
+
+		final String seqFilePath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
+		final JavaRDD<MetadataRecord> nativeStore = sc
+			.sequenceFile(seqFilePath, IntWritable.class, Text.class)
+			.map(
+				item -> parseRecord(
+					item._2().toString(),
+					xpath,
+					encoding,
+					provenance,
+					dateOfCollection,
+					totalItems,
+					invalidRecords))
+			// parseRecord returns null for the records it cannot handle
+			.filter(Objects::nonNull)
+			.distinct();
+
+		final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
+		final Dataset<MetadataRecord> mdstore = spark.createDataset(nativeStore.rdd(), encoder);
+
+		final String targetPath = currentVersion.getHdfsPath() + MDSTORE_DATA_PATH;
+
+		if (readVersion != null) { // INCREMENTAL MODE
+			log.info("updating {} incrementally with {}", targetPath, readVersion.getHdfsPath());
+			Dataset<MetadataRecord> currentMdStoreVersion = spark
+				.read()
+				.load(readVersion.getHdfsPath() + MDSTORE_DATA_PATH)
+				.as(encoder);
+			TypedColumn<MetadataRecord, MetadataRecord> aggregator = new MDStoreAggregator().toColumn();
+
+			// one record per id: among the previous and the new records the aggregator keeps a single one
+			final Dataset<MetadataRecord> latestRecords = currentMdStoreVersion
+				.union(mdstore)
+				.groupByKey(
+					(MapFunction<MetadataRecord, String>) MetadataRecord::getId,
+					Encoders.STRING())
+				.agg(aggregator)
+				.map((MapFunction<Tuple2<String, MetadataRecord>, MetadataRecord>) Tuple2::_2, encoder);
+
+			// sampling the ids triggers an additional Spark action on the merged dataset:
+			// it is a diagnostic aid, hence executed only when debug logging is enabled
+			if (log.isDebugEnabled()) {
+				latestRecords.select("id").takeAsList(100).forEach(s -> log.debug(s.toString()));
+			}
+
+			saveDataset(latestRecords, targetPath);
+
+		} else {
+			saveDataset(mdstore, targetPath);
+		}
+
+		// the size is computed on what has actually been stored
+		final Long total = spark.read().load(targetPath).count();
+		log.info("collected {} records for datasource '{}'", total, provenance.getDatasourceName());
+
+		writeHdfsFile(
+			spark.sparkContext().hadoopConfiguration(), total.toString(),
+			currentVersion.getHdfsPath() + MDSTORE_SIZE_PATH);
+	}
+
+	/**
+	 * Aggregator reducing a group of {@link MetadataRecord}s to a single record: the one with the greatest
+	 * date of collection. The buffer starts as null and null operands are ignored.
+	 */
+	public static class MDStoreAggregator extends Aggregator<MetadataRecord, MetadataRecord, MetadataRecord> {
+
+		/** The initial buffer is empty (null). */
+		@Override
+		public MetadataRecord zero() {
+			return null;
+		}
+
+		/** Folds a record into the buffer, keeping the most recent of the two. */
+		@Override
+		public MetadataRecord reduce(MetadataRecord b, MetadataRecord a) {
+			return latestOf(b, a);
+		}
+
+		/** Combines two partial buffers, keeping the most recent of the two. */
+		@Override
+		public MetadataRecord merge(MetadataRecord b, MetadataRecord a) {
+			return latestOf(b, a);
+		}
+
+		/** The buffer already is the result. */
+		@Override
+		public MetadataRecord finish(MetadataRecord r) {
+			return r;
+		}
+
+		@Override
+		public Encoder<MetadataRecord> bufferEncoder() {
+			return Encoders.bean(MetadataRecord.class);
+		}
+
+		@Override
+		public Encoder<MetadataRecord> outputEncoder() {
+			return Encoders.bean(MetadataRecord.class);
+		}
+
+		/**
+		 * Returns the non-null operand when only one is set; otherwise a when its date of collection is
+		 * strictly greater than the one of b, and b in every other case (ties included).
+		 */
+		private MetadataRecord latestOf(MetadataRecord b, MetadataRecord a) {
+			if (a == null || b == null) {
+				return b == null ? a : b;
+			}
+			if (a.getDateOfCollection() > b.getDateOfCollection()) {
+				return a;
+			}
+			return b;
+		}
+
+	}
+
 	public static MetadataRecord parseRecord(
 		final String input,
 		final String xpath,
@ -64,112 +215,11 @@ public class GenerateNativeStoreSparkJob {
 					invalidRecords.add(1);
 				return null;
 			}
-			return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection);
+			return new MetadataRecord(originalIdentifier, encoding, provenance, document.asXML(), dateOfCollection);
 		} catch (Throwable e) {
-			if (invalidRecords != null)
-				invalidRecords.add(1);
-			e.printStackTrace();
+			invalidRecords.add(1);
 			return null;
 		}
 	}

-	public static void main(String[] args) throws Exception {
-
-		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					GenerateNativeStoreSparkJob.class
-						.getResourceAsStream(
-							"/eu/dnetlib/dhp/collection/collection_input_parameters.json")));
-		parser.parseArgument(args);
-		final ObjectMapper jsonMapper = new ObjectMapper();
-		final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class);
-		final long dateOfCollection = new Long(parser.get("dateOfCollection"));
-
-		Boolean isSparkSessionManaged = Optional
-			.ofNullable(parser.get("isSparkSessionManaged"))
-			.map(Boolean::valueOf)
-			.orElse(Boolean.TRUE);
-		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-
-		final Map<String, String> ongoingMap = new HashMap<>();
-		final Map<String, String> reportMap = new HashMap<>();
-
-		final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
-
-		SparkConf conf = new SparkConf();
-		runWithSparkSession(
-			conf,
-			isSparkSessionManaged,
-			spark -> {
-				final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-
-				final JavaPairRDD<IntWritable, Text> inputRDD = sc
-					.sequenceFile(parser.get("input"), IntWritable.class, Text.class);
-
-				final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems");
-				final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords");
-
-				final MessageManager manager = new MessageManager(
-					parser.get("rabbitHost"),
-					parser.get("rabbitUser"),
-					parser.get("rabbitPassword"),
-					false,
-					false,
-					null);
-
-				final JavaRDD<MetadataRecord> mappeRDD = inputRDD
-					.map(
-						item -> parseRecord(
-							item._2().toString(),
-							parser.get("xpath"),
-							parser.get("encoding"),
-							provenance,
-							dateOfCollection,
-							totalItems,
-							invalidRecords))
-					.filter(Objects::nonNull)
-					.distinct();
-
-				ongoingMap.put("ongoing", "0");
-				if (!test) {
-					manager
-						.sendMessage(
-							new Message(
-								parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
-							parser.get("rabbitOngoingQueue"),
-							true,
-							false);
-				}
-
-				final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
-				final Dataset<MetadataRecord> mdstore = spark.createDataset(mappeRDD.rdd(), encoder);
-				final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords");
-				mdStoreRecords.add(mdstore.count());
-				ongoingMap.put("ongoing", "" + totalItems.value());
-				if (!test) {
-					manager
-						.sendMessage(
-							new Message(
-								parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap),
-							parser.get("rabbitOngoingQueue"),
-							true,
-							false);
-				}
-				mdstore.write().format("parquet").save(parser.get("output"));
-				reportMap.put("inputItem", "" + totalItems.value());
-				reportMap.put("invalidRecords", "" + invalidRecords.value());
-				reportMap.put("mdStoreSize", "" + mdStoreRecords.value());
-				if (!test) {
-					manager
-						.sendMessage(
-							new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
-							parser.get("rabbitReportQueue"),
-							true,
-							false);
-					manager.close();
-				}
-			});
-
-	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpClientParams.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpClientParams.java
@ -0,0 +1,94 @@
+
+package eu.dnetlib.dhp.collection;
+
+/**
+ * Holder of the HTTP connection settings (retries, delays and timeouts) that drive the client behaviour.
+ */
+public class HttpClientParams {
+
+	// Default values applied by the no-argument constructor
+	public static int _maxNumberOfRetry = 3;
+	public static int _requestDelay = 0; // milliseconds
+	public static int _retryDelay = 10; // seconds
+	public static int _connectTimeOut = 10; // seconds
+	public static int _readTimeOut = 30; // seconds
+
+	/** Maximum number of allowed retries before failing. */
+	private int maxNumberOfRetry;
+
+	/** Delay between requests, in milliseconds. */
+	private int requestDelay;
+
+	/** Time to wait after a failure before retrying, in seconds. */
+	private int retryDelay;
+
+	/** Connect timeout, in seconds. */
+	private int connectTimeOut;
+
+	/** Read timeout, in seconds. */
+	private int readTimeOut;
+
+	/**
+	 * Creates the settings using the current values of the static defaults.
+	 */
+	public HttpClientParams() {
+		this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut);
+	}
+
+	/**
+	 * Creates the settings from explicit values.
+	 *
+	 * @param maxNumberOfRetry maximum number of retries
+	 * @param requestDelay delay between requests (milliseconds)
+	 * @param retryDelay delay before retrying after a failure (seconds)
+	 * @param connectTimeOut connect timeout (seconds)
+	 * @param readTimeOut read timeout (seconds)
+	 */
+	public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
+		int readTimeOut) {
+		this.maxNumberOfRetry = maxNumberOfRetry;
+		this.requestDelay = requestDelay;
+		this.retryDelay = retryDelay;
+		this.connectTimeOut = connectTimeOut;
+		this.readTimeOut = readTimeOut;
+	}
+
+	/** @return the maximum number of retries */
+	public int getMaxNumberOfRetry() {
+		return this.maxNumberOfRetry;
+	}
+
+	/** @param maxNumberOfRetry the maximum number of retries */
+	public void setMaxNumberOfRetry(int maxNumberOfRetry) {
+		this.maxNumberOfRetry = maxNumberOfRetry;
+	}
+
+	/** @return the delay between requests (milliseconds) */
+	public int getRequestDelay() {
+		return this.requestDelay;
+	}
+
+	/** @param requestDelay the delay between requests (milliseconds) */
+	public void setRequestDelay(int requestDelay) {
+		this.requestDelay = requestDelay;
+	}
+
+	/** @return the delay before retrying after a failure (seconds) */
+	public int getRetryDelay() {
+		return this.retryDelay;
+	}
+
+	/** @param retryDelay the delay before retrying after a failure (seconds) */
+	public void setRetryDelay(int retryDelay) {
+		this.retryDelay = retryDelay;
+	}
+
+	/** @return the connect timeout (seconds) */
+	public int getConnectTimeOut() {
+		return this.connectTimeOut;
+	}
+
+	/** @param connectTimeOut the connect timeout (seconds) */
+	public void setConnectTimeOut(int connectTimeOut) {
+		this.connectTimeOut = connectTimeOut;
+	}
+
+	/** @return the read timeout (seconds) */
+	public int getReadTimeOut() {
+		return this.readTimeOut;
+	}
+
+	/** @param readTimeOut the read timeout (seconds) */
+	public void setReadTimeOut(int readTimeOut) {
+		this.readTimeOut = readTimeOut;
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java
@ -0,0 +1,259 @@
+
+package eu.dnetlib.dhp.collection;
+
+import static eu.dnetlib.dhp.utils.DHPUtils.*;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.*;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.math.NumberUtils;
+import org.apache.http.HttpHeaders;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+
+/**
+ * HTTP GET client built on {@link HttpURLConnection}: it follows redirects itself, retries on 503 responses and
+ * on socket errors, and honours the Retry-After header, within the limits set by {@link HttpClientParams}.
+ *
+ * Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java
+ *
+ * @author jochen, michele, andrea, alessia, claudio
+ */
+public class HttpConnector2 {
+
+	private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class);
+
+	private static final String REPORT_PREFIX = "http:";
+
+	private HttpClientParams clientParams;
+
+	// content type of the last successful (2xx) response
+	private String responseType = null;
+
+	private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
+
+	public HttpConnector2() {
+		this(new HttpClientParams());
+	}
+
+	/**
+	 * Creates the connector; note that it also installs a JVM-wide default cookie handler accepting all cookies.
+	 */
+	public HttpConnector2(HttpClientParams clientParams) {
+		this.clientParams = clientParams;
+		CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
+	}
+
+	/**
+	 * @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
+	 */
+	public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
+		return IOUtils.toInputStream(getInputSource(requestUrl));
+	}
+
+	/**
+	 * @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
+	 */
+	public String getInputSource(final String requestUrl) throws CollectorException {
+		return attemptDownloadAsString(requestUrl, 1, new AggregatorReport());
+	}
+
+	/**
+	 * Given the URL returns the content via HTTP GET
+	 *
+	 * @param requestUrl the URL
+	 * @param report the list of errors
+	 * @return the content of the downloaded resource
+	 * @throws CollectorException when retrying more than maxNumberOfRetry times
+	 */
+	public String getInputSource(final String requestUrl, AggregatorReport report)
+		throws CollectorException {
+		return attemptDownloadAsString(requestUrl, 1, report);
+	}
+
+	/**
+	 * Downloads the resource and returns its whole content as a string, closing the response stream.
+	 */
+	private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
+		final AggregatorReport report) throws CollectorException {
+
+		try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
+			return IOUtils.toString(s);
+		} catch (IOException e) {
+			log.error(e.getMessage(), e);
+			throw new CollectorException(e);
+		}
+	}
+
+	/**
+	 * Performs one GET attempt and, depending on the outcome, returns the response stream (2xx), follows the
+	 * redirect (3xx), fails (4xx and 5xx other than 503) or waits and retries (503, socket errors). Redirects and
+	 * retries both consume one attempt out of the maximum number of retries.
+	 *
+	 * @param requestUrl the URL to request
+	 * @param retryNumber the 1-based number of the current attempt
+	 * @param report collects the errors met along the way
+	 * @return the response stream of the successful request
+	 * @throws CollectorException when the attempts are exhausted or the error is not recoverable
+	 * @throws IOException on I/O errors that are not handled as retryable
+	 */
+	private InputStream attemptDownload(final String requestUrl, final int retryNumber,
+		final AggregatorReport report) throws CollectorException, IOException {
+
+		if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
+			final String msg = String
+				.format(
+					"Max number of retries (%s/%s) exceeded, failing.",
+					retryNumber, getClientParams().getMaxNumberOfRetry());
+			log.error(msg);
+			throw new CollectorException(msg);
+		}
+
+		log.info("Request attempt {} [{}]", retryNumber, requestUrl);
+
+		try {
+			// the request delay is expressed in milliseconds
+			if (getClientParams().getRequestDelay() > 0) {
+				backoffAndSleep(getClientParams().getRequestDelay());
+			}
+			final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
+			// redirects are handled below, so that they are reported and counted as attempts
+			urlConn.setInstanceFollowRedirects(false);
+			// timeouts are configured in seconds
+			urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
+			urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
+			urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
+
+			if (log.isDebugEnabled()) {
+				logHeaderFields(urlConn);
+			}
+
+			// suggested wait in seconds, or -1 when the header is missing or not an integer
+			int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
+			if (is2xx(urlConn.getResponseCode())) {
+				final InputStream input = urlConn.getInputStream();
+				responseType = urlConn.getContentType();
+				return input;
+			}
+			if (is3xx(urlConn.getResponseCode())) {
+				// REDIRECTS
+				final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
+				log.info(String.format("The requested url has been moved to %s", newUrl));
+				report
+					.put(
+						REPORT_PREFIX + urlConn.getResponseCode(),
+						String.format("Moved to: %s", newUrl));
+				urlConn.disconnect();
+				if (retryAfter > 0) {
+					// retryAfter is in seconds, backoffAndSleep expects milliseconds
+					backoffAndSleep(retryAfter * 1000);
+				}
+				return attemptDownload(newUrl, retryNumber + 1, report);
+			}
+			if (is4xx(urlConn.getResponseCode())) {
+				// CLIENT ERROR, DO NOT RETRY
+				report
+					.put(
+						REPORT_PREFIX + urlConn.getResponseCode(),
+						String
+							.format(
+								"%s error: %s", requestUrl, urlConn.getResponseMessage()));
+				throw new CollectorException("4xx error: request will not be repeated. " + report);
+			}
+			if (is5xx(urlConn.getResponseCode())) {
+				// SERVER SIDE ERRORS RETRY ONLY on 503
+				switch (urlConn.getResponseCode()) {
+					case HttpURLConnection.HTTP_UNAVAILABLE:
+						if (retryAfter > 0) {
+							log
+								.warn(
+									requestUrl + " - waiting and repeating request after suggested retry-after "
+										+ retryAfter + " sec.");
+							backoffAndSleep(retryAfter * 1000);
+						} else {
+							log
+								.warn(
+									requestUrl + " - waiting and repeating request after default delay of "
+										+ getClientParams().getRetryDelay() + " sec.");
+							// the wait grows linearly with the attempt number
+							backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000);
+						}
+						report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
+						urlConn.disconnect();
+						return attemptDownload(requestUrl, retryNumber + 1, report);
+					default:
+						report
+							.put(
+								REPORT_PREFIX + urlConn.getResponseCode(),
+								String
+									.format(
+										"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
+						throw new CollectorException(urlConn.getResponseCode() + " error " + report);
+				}
+			}
+			throw new CollectorException(
+				String
+					.format(
+						"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
+						MAPPER.writeValueAsString(report)));
+		} catch (MalformedURLException | UnknownHostException e) {
+			// not recoverable by retrying
+			log.error(e.getMessage(), e);
+			report.put(e.getClass().getName(), e.getMessage());
+			throw new CollectorException(e.getMessage(), e);
+		} catch (SocketTimeoutException | SocketException e) {
+			// possibly transient: wait (retry delay is in seconds) and try again
+			log.error(e.getMessage(), e);
+			report.put(e.getClass().getName(), e.getMessage());
+			backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
+			return attemptDownload(requestUrl, retryNumber + 1, report);
+		}
+	}
+
+	/**
+	 * Logs the response message and all the named response headers at debug level.
+	 */
+	private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
+		log.debug("StatusCode: " + urlConn.getResponseMessage());
+
+		for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
+			if (e.getKey() != null) {
+				for (String v : e.getValue()) {
+					log.debug("  key: " + e.getKey() + " - value: " + v);
+				}
+			}
+		}
+	}
+
+	/**
+	 * Sleeps for the given time.
+	 *
+	 * @param sleepTimeMs the time to sleep, in milliseconds
+	 * @throws CollectorException when the sleep is interrupted
+	 */
+	private void backoffAndSleep(int sleepTimeMs) throws CollectorException {
+		log.info("I'm going to sleep for {}ms", sleepTimeMs);
+		try {
+			Thread.sleep(sleepTimeMs);
+		} catch (InterruptedException e) {
+			log.error(e.getMessage(), e);
+			throw new CollectorException(e);
+		}
+	}
+
+	/**
+	 * Extracts the wait suggested by the Retry-After response header.
+	 *
+	 * @param headerMap the response headers
+	 * @return the header value plus 10, or -1 when the header is missing or its first value is not an integer
+	 *         (e.g. it is expressed as a date)
+	 */
+	private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
+		for (Map.Entry<String, List<String>> header : headerMap.entrySet()) {
+			final String key = header.getKey();
+			final List<String> values = header.getValue();
+			if (key == null || !key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) || values == null || values.isEmpty()) {
+				continue;
+			}
+			final String first = values.get(0);
+			if (first == null) {
+				continue;
+			}
+			try {
+				return Integer.parseInt(first.trim()) + 10;
+			} catch (NumberFormatException e) {
+				// not an integer number: the suggestion is ignored
+				log.debug("Ignoring unparsable Retry-After value: {}", first);
+			}
+		}
+		return -1;
+	}
+
+	/**
+	 * Extracts the redirect target from the Location response header.
+	 *
+	 * @throws CollectorException when the header is missing
+	 */
+	private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
+		for (String key : headerMap.keySet()) {
+			if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
+				return headerMap.get(key).get(0);
+			}
+		}
+		throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
+	}
+
+	private boolean is2xx(final int statusCode) {
+		return statusCode >= 200 && statusCode <= 299;
+	}
+
+	private boolean is4xx(final int statusCode) {
+		return statusCode >= 400 && statusCode <= 499;
+	}
+
+	private boolean is3xx(final int statusCode) {
+		return statusCode >= 300 && statusCode <= 399;
+	}
+
+	private boolean is5xx(final int statusCode) {
+		return statusCode >= 500 && statusCode <= 599;
+	}
+
+	/**
+	 * @return the content type of the last successful response, or null when there has been none
+	 */
+	public String getResponseType() {
+		return responseType;
+	}
+
+	public HttpClientParams getClientParams() {
+		return clientParams;
+	}
+
+	public void setClientParams(HttpClientParams clientParams) {
+		this.clientParams = clientParams;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java
@ -0,0 +1,84 @@
+
+package eu.dnetlib.dhp.collection;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * Helpers to turn a JSON record into XML through the org.json XML converter.
+ */
+public class JsonUtils {
+
+	private static final Log log = LogFactory.getLog(JsonUtils.class);
+
+	/** Tag name handed to org.json.XML.toString so that the produced XML has a single root element. */
+	public static final String wrapName = "recordWrap";
+
+	/**
+	 * Rewrites JSON key names so that they can be used as XML tag names by the org.json JSON-to-XML
+	 * conversion (see https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid names).
+	 * The rules are applied in this order, each one until it no longer matches:
+	 * <ul>
+	 * <li>whitespace(s) and '/' become '_'</li>
+	 * <li>'(' and ')' are removed</li>
+	 * <li>keys whose second character is a digit get the prefix 'n_'</li>
+	 * <li>keys made only of digits get the prefix 'm_'</li>
+	 * <li>a ':' between two digits is removed</li>
+	 * <li>'=' becomes '-'</li>
+	 * </ul>
+	 * A "key" is recognised purely textually as a quoted string immediately followed by ':'.
+	 *
+	 * Known limitation: string values that happen to contain a quote-colon sequence, e.g. the array
+	 * element <code>": something"</code>, can be mistaken for key names and rewritten as well.
+	 *
+	 * @param jsonInput the JSON text, on one or on several lines
+	 * @return the JSON text with the key names rewritten
+	 */
+	public String syntaxConvertJsonKeyNames(String jsonInput) {
+
+		log.trace("before convertJsonKeyNames: " + jsonInput);
+		// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
+		// replace whitespace in JSON names with '_'
+		jsonInput = replaceInKeyNames(jsonInput, "\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
+
+		// replace forward-slash (sign '/' ) in JSON names with '_'
+		jsonInput = replaceInKeyNames(jsonInput, "\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
+
+		// replace '(' in JSON names with ''
+		jsonInput = replaceInKeyNames(jsonInput, "\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
+
+		// replace ')' in JSON names with ''
+		jsonInput = replaceInKeyNames(jsonInput, "\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
+
+		// add prefix 'n_' to JSON key names carrying a digit in second position
+		// NOTE(review): the original comment speaks of names STARTING with a number, but the pattern
+		// tests the second character; the pattern is kept unchanged - confirm the intent.
+		jsonInput = replaceInKeyNames(jsonInput, "\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
+
+		// add prefix 'm_' to JSON key names made only of digits
+		jsonInput = replaceInKeyNames(jsonInput, "\"([0-9]+)\":", "\"m_$1\":");
+
+		// replace ':' between numbers like '2018-08-28T11:05:00Z' in JSON key names with ''
+		jsonInput = replaceInKeyNames(jsonInput, "\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
+
+		// replace ',' in JSON key names with '.' to prevent , in xml tagnames.
+		// jsonInput = replaceInKeyNames(jsonInput, "\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
+
+		// replace '=' in JSON key names with '-'
+		jsonInput = replaceInKeyNames(jsonInput, "\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
+
+		log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
+		return jsonInput;
+	}
+
+	/**
+	 * Applies the replacement to every match of the regex, over and over, until the regex no longer
+	 * occurs anywhere in the text.
+	 *
+	 * The search uses find() on the whole text rather than String.matches(".*regex.*"): '.' does not
+	 * match line terminators, so the matches-based guard silently skipped any multi-line JSON.
+	 */
+	private static String replaceInKeyNames(String text, final String keyRegex, final String replacement) {
+		final java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(keyRegex);
+		java.util.regex.Matcher matcher = pattern.matcher(text);
+		while (matcher.find()) {
+			text = matcher.replaceAll(replacement);
+			matcher = pattern.matcher(text);
+		}
+		return text;
+	}
+
+	/**
+	 * Converts a JSON record to an XML document: the key names are first normalised with
+	 * {@link #syntaxConvertJsonKeyNames(String)}, the result is serialised by org.json.XML under the
+	 * {@link #wrapName} root element and finally passed through XmlCleaner.cleanAllEntities.
+	 *
+	 * @param jsonRecord the JSON object to convert
+	 * @return the XML text, starting with the XML declaration
+	 */
+	public String convertToXML(final String jsonRecord) {
+		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+		org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
+		resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
+		log.trace("before inputStream: " + resultXml);
+		resultXml = XmlCleaner.cleanAllEntities(resultXml);
+		log.trace("after cleaning: " + resultXml);
+		return resultXml;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/UnknownCollectorPluginException.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/UnknownCollectorPluginException.java
@ -0,0 +1,32 @@
+
+package eu.dnetlib.dhp.collection;
+
+/**
+ * Checked exception type named after the "unknown collector plugin" condition; it adds nothing to
+ * java.lang.Exception beyond mirroring its constructors.
+ */
+public class UnknownCollectorPluginException extends Exception {
+
+	/** Serialization version identifier. */
+	private static final long serialVersionUID = -290723075076039757L;
+
+	/** Creates an exception without message and without cause. */
+	public UnknownCollectorPluginException() {
+		super();
+	}
+
+	/** @param message the detail message */
+	public UnknownCollectorPluginException(final String message) {
+		super(message);
+	}
+
+	/** @param cause the underlying cause */
+	public UnknownCollectorPluginException(final Throwable cause) {
+		super(cause);
+	}
+
+	/**
+	 * @param message the detail message
+	 * @param cause the underlying cause
+	 */
+	public UnknownCollectorPluginException(final String message, final Throwable cause) {
+		super(message, cause);
+	}
+
+	/**
+	 * @param message the detail message
+	 * @param cause the underlying cause
+	 * @param enableSuppression whether suppression is enabled
+	 * @param writableStackTrace whether the stack trace is writable
+	 */
+	public UnknownCollectorPluginException(
+		final String message,
+		final Throwable cause,
+		final boolean enableSuppression,
+		final boolean writableStackTrace) {
+		super(message, cause, enableSuppression, writableStackTrace);
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java
@ -1,5 +1,5 @@

-package eu.dnetlib.dhp.collection.worker.utils;
+package eu.dnetlib.dhp.collection;

 import java.util.HashMap;
 import java.util.HashSet;
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java
@ -3,10 +3,21 @@ package eu.dnetlib.dhp.collection.plugin;

 import java.util.stream.Stream;

-import eu.dnetlib.collector.worker.model.ApiDescriptor;
-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.collection.CollectorException;

 public interface CollectorPlugin {

-	Stream<String> collect(ApiDescriptor api) throws DnetCollectorException;
+	/** Names of the collector plugins declared by this interface. */
+	enum NAME {
+		oai, other, rest_json2xml;
+
+		// presumably these qualify the 'other' plugin name - TODO confirm against the code that resolves plugins
+		public enum OTHER_NAME {
+			mdstore_mongodb_dump, mdstore_mongodb
+		}
+
+	}
+
+	/**
+	 * Collects the content exposed by the given API.
+	 *
+	 * @param api descriptor of the API to collect from
+	 * @param report report object handed to the plugin together with the API descriptor
+	 * @return the collected content as a stream of strings
+	 * @throws CollectorException declared for failures of the collection
+	 */
+	Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException;
+
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java
@ -0,0 +1,58 @@
+
+package eu.dnetlib.dhp.collection.plugin.mongodb;
+
+import java.util.Optional;
+import java.util.Spliterator;
+import java.util.Spliterators;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import org.bson.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.mongodb.client.MongoCollection;
+
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.collection.CollectorException;
+import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+import eu.dnetlib.dhp.common.MdstoreClient;
+
+/**
+ * Collector plugin reading the records of one mdstore collection straight from MongoDB.
+ */
+public class MDStoreCollectorPlugin implements CollectorPlugin {
+
+	private static final Logger log = LoggerFactory.getLogger(MDStoreCollectorPlugin.class);
+
+	/** Name of the API parameter carrying the MongoDB database name. */
+	public static final String MONGODB_DBNAME = "mongodb_dbname";
+	/** Name of the API parameter carrying the identifier of the mdstore to read. */
+	public static final String MDSTORE_ID = "mdstore_id";
+
+	/**
+	 * Streams the "body" field of every document found in the requested mdstore.
+	 *
+	 * The returned stream holds an open MongoDB cursor: closing the stream (e.g. with
+	 * try-with-resources) releases it.
+	 *
+	 * @param api descriptor providing the mongodb base url and the parameters
+	 *        {@link #MONGODB_DBNAME} and {@link #MDSTORE_ID}
+	 * @param report not used by this plugin
+	 * @return one string per document, taken from its "body" field
+	 * @throws CollectorException when the base url or one of the two parameters is missing
+	 */
+	@Override
+	public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
+
+		final String mongoBaseUrl = Optional
+			.ofNullable(api.getBaseUrl())
+			.orElseThrow(
+				() -> new CollectorException(
+					"missing mongodb baseUrl, expected in eu.dnetlib.dhp.collection.ApiDescriptor.baseUrl"));
+		log.info("mongoBaseUrl: {}", mongoBaseUrl);
+
+		final String dbName = Optional
+			.ofNullable(api.getParams().get(MONGODB_DBNAME))
+			.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_DBNAME)));
+		log.info("dbName: {}", dbName);
+
+		final String mdId = Optional
+			.ofNullable(api.getParams().get(MDSTORE_ID))
+			.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MDSTORE_ID)));
+		log.info("mdId: {}", mdId);
+
+		// NOTE(review): the MdstoreClient itself is never released here; if it exposes a close()
+		// it should be chained into onClose below as well - confirm against MdstoreClient.
+		final MdstoreClient client = new MdstoreClient(mongoBaseUrl, dbName);
+		final MongoCollection<Document> mdstore = client.mdStore(mdId);
+		long size = mdstore.count();
+
+		// keep a handle on the cursor so that it can be closed together with the stream
+		final com.mongodb.client.MongoCursor<Document> cursor = mdstore.find().iterator();
+
+		return StreamSupport
+			.stream(
+				Spliterators.spliterator(cursor, size, Spliterator.SIZED), false)
+			.map(doc -> doc.getString("body"))
+			.onClose(cursor::close);
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java
@ -0,0 +1,54 @@
+
+package eu.dnetlib.dhp.collection.plugin.mongodb;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.Optional;
+import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.collection.CollectorException;
+import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+import eu.dnetlib.dhp.utils.DHPUtils;
+
+/**
+ * Collector plugin reading records from a gzip-compressed dump file, one JSON document per line.
+ */
+public class MongoDbDumpCollectorPlugin implements CollectorPlugin {
+
+	/** Name of the API parameter carrying the location of the dump file. */
+	public static final String PATH_PARAM = "path";
+	/** JSONPath evaluated on every line of the dump to extract the record. */
+	public static final String BODY_JSONPATH = "$.body";
+
+	public FileSystem fileSystem;
+
+	/** @param fileSystem the file system the dump path is resolved against */
+	public MongoDbDumpCollectorPlugin(FileSystem fileSystem) {
+		this.fileSystem = fileSystem;
+	}
+
+	/**
+	 * Streams the value found at {@link #BODY_JSONPATH} in every line of the dump file.
+	 *
+	 * The returned stream holds the open file: closing the stream (e.g. with try-with-resources)
+	 * closes the underlying reader.
+	 *
+	 * @param api descriptor providing the {@link #PATH_PARAM} parameter
+	 * @param report not used by this plugin
+	 * @return one string per line of the dump
+	 * @throws CollectorException when the parameter is missing, the path does not exist or the file
+	 *         cannot be opened
+	 */
+	@Override
+	public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
+
+		final Path path = Optional
+			.ofNullable(api.getParams().get(PATH_PARAM))
+			.map(Path::new)
+			.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", PATH_PARAM)));
+
+		try {
+			if (!fileSystem.exists(path)) {
+				throw new CollectorException("path does not exist: " + path.toString());
+			}
+
+			// Decode explicitly as UTF-8 instead of the platform default charset, so that the result
+			// does not depend on the JVM settings of the node running the job.
+			// NOTE(review): assumes the dump is UTF-8 encoded JSON - confirm with the dump producer.
+			final BufferedReader reader = new BufferedReader(
+				new InputStreamReader(new GZIPInputStream(fileSystem.open(path)), Charset.forName("UTF-8")));
+
+			return reader
+				.lines()
+				.map(s -> DHPUtils.getJPathString(BODY_JSONPATH, s))
+				.onClose(() -> {
+					try {
+						reader.close();
+					} catch (IOException e) {
+						throw new java.io.UncheckedIOException(e);
+					}
+				});
+
+		} catch (IOException e) {
+			throw new CollectorException(e);
+		}
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java
@ -13,9 +13,11 @@ import com.google.common.base.Splitter;
 import com.google.common.collect.Iterators;
 import com.google.common.collect.Lists;

-import eu.dnetlib.collector.worker.model.ApiDescriptor;
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.collection.CollectorException;
+import eu.dnetlib.dhp.collection.HttpClientParams;
 import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;

 public class OaiCollectorPlugin implements CollectorPlugin {

@ -26,8 +28,15 @@ public class OaiCollectorPlugin implements CollectorPlugin {

 	private OaiIteratorFactory oaiIteratorFactory;

+	private HttpClientParams clientParams;
+
+	/** @param clientParams HTTP client parameters, passed on to the OAI iterator factory by collect */
+	public OaiCollectorPlugin(HttpClientParams clientParams) {
+		this.clientParams = clientParams;
+	}
+
 	@Override
-	public Stream<String> collect(final ApiDescriptor api) throws DnetCollectorException {
+	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
+		throws CollectorException {
 		final String baseUrl = api.getBaseUrl();
 		final String mdFormat = api.getParams().get(FORMAT_PARAM);
 		final String setParam = api.getParams().get(OAI_SET_PARAM);
@ -46,26 +55,26 @@ public class OaiCollectorPlugin implements CollectorPlugin {
 		}

 		if (baseUrl == null || baseUrl.isEmpty()) {
-			throw new DnetCollectorException("Param 'baseurl' is null or empty");
+			throw new CollectorException("Param 'baseurl' is null or empty");
 		}

 		if (mdFormat == null || mdFormat.isEmpty()) {
-			throw new DnetCollectorException("Param 'mdFormat' is null or empty");
+			throw new CollectorException("Param 'mdFormat' is null or empty");
 		}

 		if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
-			throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
+			throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
 		}

 		if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
-			throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
+			throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
 		}

 		final Iterator<Iterator<String>> iters = sets
 			.stream()
 			.map(
 				set -> getOaiIteratorFactory()
-					.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
+					.newIterator(baseUrl, mdFormat, set, fromDate, untilDate, getClientParams(), report))
 			.iterator();

 		return StreamSupport
@ -79,4 +88,12 @@ public class OaiCollectorPlugin implements CollectorPlugin {
 		}
 		return oaiIteratorFactory;
 	}
+
+	/** @return the HTTP client parameters currently held by this plugin */
+	public HttpClientParams getClientParams() {
+		return clientParams;
+	}
+
+	/** @param clientParams the new HTTP client parameters; stored as-is */
+	public void setClientParams(HttpClientParams clientParams) {
+		this.clientParams = clientParams;
+	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java
@ -1,7 +1,9 @@

 package eu.dnetlib.dhp.collection.plugin.oai;

+import java.io.IOException;
 import java.io.StringReader;
+import java.io.StringWriter;
 import java.io.UnsupportedEncodingException;
 import java.net.URLEncoder;
 import java.util.Iterator;
@ -9,24 +11,28 @@ import java.util.Queue;
 import java.util.concurrent.PriorityBlockingQueue;

 import org.apache.commons.lang.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.dom4j.Document;
 import org.dom4j.DocumentException;
+import org.dom4j.DocumentHelper;
 import org.dom4j.Node;
+import org.dom4j.io.OutputFormat;
 import org.dom4j.io.SAXReader;
+import org.dom4j.io.XMLWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
-import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
-import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner;
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+import eu.dnetlib.dhp.collection.CollectorException;
+import eu.dnetlib.dhp.collection.HttpConnector2;
+import eu.dnetlib.dhp.collection.XmlCleaner;

 public class OaiIterator implements Iterator<String> {

-	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on
-	// 11/24/08 5:02 PM
+	private static final Logger log = LoggerFactory.getLogger(OaiIterator.class);
+
+	private final static String REPORT_PREFIX = "oai:";

 	private final Queue<String> queue = new PriorityBlockingQueue<>();
-	private final SAXReader reader = new SAXReader();

 	private final String baseUrl;
 	private final String set;
@ -35,7 +41,8 @@ public class OaiIterator implements Iterator<String> {
 	private final String untilDate;
 	private String token;
 	private boolean started;
-	private final HttpConnector httpConnector;
+	private final HttpConnector2 httpConnector;
+	private AggregatorReport report;

 	public OaiIterator(
 		final String baseUrl,
@ -43,7 +50,8 @@ public class OaiIterator implements Iterator<String> {
 		final String set,
 		final String fromDate,
 		final String untilDate,
-		final HttpConnector httpConnector) {
+		final HttpConnector2 httpConnector,
+		final AggregatorReport report) {
 		this.baseUrl = baseUrl;
 		this.mdFormat = mdFormat;
 		this.set = set;
@ -51,6 +59,7 @@ public class OaiIterator implements Iterator<String> {
 		this.untilDate = untilDate;
 		this.started = false;
 		this.httpConnector = httpConnector;
+		this.report = report;
 	}

 	private void verifyStarted() {
@ -58,7 +67,7 @@ public class OaiIterator implements Iterator<String> {
 			this.started = true;
 			try {
 				this.token = firstPage();
-			} catch (final DnetCollectorException e) {
+			} catch (final CollectorException e) {
 				throw new RuntimeException(e);
 			}
 		}
@ -80,7 +89,7 @@ public class OaiIterator implements Iterator<String> {
 			while (queue.isEmpty() && token != null && !token.isEmpty()) {
 				try {
 					token = otherPages(token);
-				} catch (final DnetCollectorException e) {
+				} catch (final CollectorException e) {
 					throw new RuntimeException(e);
 				}
 			}
@ -92,7 +101,7 @@ public class OaiIterator implements Iterator<String> {
 	public void remove() {
 	}

-	private String firstPage() throws DnetCollectorException {
+	private String firstPage() throws CollectorException {
 		try {
 			String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8");
 			if (set != null && !set.isEmpty()) {
@ -108,7 +117,8 @@ public class OaiIterator implements Iterator<String> {

 			return downloadPage(url);
 		} catch (final UnsupportedEncodingException e) {
-			throw new DnetCollectorException(e);
+			report.put(e.getClass().getName(), e.getMessage());
+			throw new CollectorException(e);
 		}
 	}

@ -126,32 +136,35 @@ public class OaiIterator implements Iterator<String> {
 		return result.trim();
 	}

-	private String otherPages(final String resumptionToken) throws DnetCollectorException {
+	private String otherPages(final String resumptionToken) throws CollectorException {
 		try {
 			return downloadPage(
 				baseUrl
 					+ "?verb=ListRecords&resumptionToken="
 					+ URLEncoder.encode(resumptionToken, "UTF-8"));
 		} catch (final UnsupportedEncodingException e) {
-			throw new DnetCollectorException(e);
+			report.put(e.getClass().getName(), e.getMessage());
+			throw new CollectorException(e);
 		}
 	}

-	private String downloadPage(final String url) throws DnetCollectorException {
+	private String downloadPage(final String url) throws CollectorException {

-		final String xml = httpConnector.getInputSource(url);
+		final String xml = httpConnector.getInputSource(url, report);
 		Document doc;
 		try {
-			doc = reader.read(new StringReader(xml));
+			doc = DocumentHelper.parseText(xml);
 		} catch (final DocumentException e) {
-			log.warn("Error parsing xml, I try to clean it: " + xml, e);
+			log.warn("Error parsing xml, I try to clean it. {}", e.getMessage());
+			report.put(e.getClass().getName(), e.getMessage());
 			final String cleaned = XmlCleaner.cleanAllEntities(xml);
 			try {
-				doc = reader.read(new StringReader(cleaned));
+				// retry on the CLEANED text: parsing the raw xml again can only fail the same way
+				doc = DocumentHelper.parseText(cleaned);
 			} catch (final DocumentException e1) {
 				final String resumptionToken = extractResumptionToken(xml);
 				if (resumptionToken == null) {
-					throw new DnetCollectorException("Error parsing cleaned document:" + cleaned, e1);
+					report.put(e1.getClass().getName(), e1.getMessage());
+					throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1);
 				}
 				return resumptionToken;
 			}
@ -159,19 +172,35 @@ public class OaiIterator implements Iterator<String> {

 		final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
 		if (errorNode != null) {
-			final String code = errorNode.valueOf("@code");
-			if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
-				log.warn("noRecordsMatch for oai call: " + url);
+			final String code = errorNode.valueOf("@code").trim();
+			if ("noRecordsMatch".equalsIgnoreCase(code)) {
+				final String msg = "noRecordsMatch for oai call : " + url;
+				log.warn(msg);
+				report.put(REPORT_PREFIX + code, msg);
 				return null;
 			} else {
-				throw new DnetCollectorException(code + " - " + errorNode.getText());
+				final String msg = code + " - " + errorNode.getText();
+				report.put(REPORT_PREFIX + "error", msg);
+				throw new CollectorException(msg);
 			}
 		}

 		for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
-			queue.add(((Node) o).asXML());
+			final StringWriter sw = new StringWriter();
+			final XMLWriter writer = new XMLWriter(sw, OutputFormat.createPrettyPrint());
+			try {
+				writer.write((Node) o);
+				queue.add(sw.toString());
+			} catch (IOException e) {
+				report.put(e.getClass().getName(), e.getMessage());
+				throw new CollectorException("Error parsing XML record:\n" + ((Node) o).asXML(), e);
+			}
 		}

 		return doc.valueOf("//*[local-name()='resumptionToken']");
 	}
+
+	/** @return the report object this iterator writes its error entries to */
+	public AggregatorReport getReport() {
+		return report;
+	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java
@ -3,24 +3,28 @@ package eu.dnetlib.dhp.collection.plugin.oai;

 import java.util.Iterator;

-import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+import eu.dnetlib.dhp.collection.HttpClientParams;
+import eu.dnetlib.dhp.collection.HttpConnector2;

 public class OaiIteratorFactory {

-	private HttpConnector httpConnector;
+	// created on first use by getHttpConnector and then shared by every iterator of this factory
+	private HttpConnector2 httpConnector;

 	public Iterator<String> newIterator(
 		final String baseUrl,
 		final String mdFormat,
 		final String set,
 		final String fromDate,
-		final String untilDate) {
-		return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector());
+		final String untilDate,
+		final HttpClientParams clientParams,
+		final AggregatorReport report) {
+		return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), report);
 	}

-	private HttpConnector getHttpConnector() {
+	// Returns the cached connector, creating it on the first call. The clientParams argument is only
+	// used for that first creation: later calls get the same connector whatever they pass in.
+	private HttpConnector2 getHttpConnector(HttpClientParams clientParams) {
 		if (httpConnector == null)
-			httpConnector = new HttpConnector();
+			httpConnector = new HttpConnector2(clientParams);
 		return httpConnector;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
@ -0,0 +1,105 @@
+
+package eu.dnetlib.dhp.collection.plugin.rest;
+
+import java.util.Optional;
+import java.util.Spliterator;
+import java.util.Spliterators;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.collection.CollectorException;
+import eu.dnetlib.dhp.collection.HttpClientParams;
+import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+
+/**
+ * TODO: delegate HTTP requests to the common HttpConnector2 implementation.
+ *
+ * @author 	js, Andreas Czerniak
+ * @date 	2020-04-09
+ *
+ */
+public class RestCollectorPlugin implements CollectorPlugin {
+
+	/** Page size used when the 'resultSizeValue' parameter is missing or blank. */
+	public static final String RESULT_SIZE_VALUE_DEFAULT = "100";
+
+	private HttpClientParams clientParams;
+
+	/** @param clientParams HTTP client parameters handed to the RestIterator created by collect */
+	public RestCollectorPlugin(HttpClientParams clientParams) {
+		this.clientParams = clientParams;
+	}
+
+	/**
+	 * Validates the API parameters and returns a lazy stream backed by a RestIterator.
+	 *
+	 * Mandatory: the base url and the parameters resumptionType, resumptionParam, resultFormatValue,
+	 * queryParams and entityXpath. 'resultSizeValue' defaults to {@link #RESULT_SIZE_VALUE_DEFAULT}
+	 * and must be an integer; 'resultOutputFormat' defaults to the lower-cased resultFormatValue.
+	 *
+	 * @param api descriptor of the REST API to collect from
+	 * @param report not used by this plugin
+	 * @return the records produced by the iterator, in iteration order
+	 * @throws CollectorException when a mandatory parameter is blank or 'resultSizeValue' is not an
+	 *         integer
+	 */
+	@Override
+	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
+		final String baseUrl = api.getBaseUrl();
+
+		final String resumptionType = api.getParams().get("resumptionType");
+		final String resumptionParam = api.getParams().get("resumptionParam");
+		final String resumptionXpath = api.getParams().get("resumptionXpath");
+		final String resultTotalXpath = api.getParams().get("resultTotalXpath");
+		final String resultFormatParam = api.getParams().get("resultFormatParam");
+		final String resultFormatValue = api.getParams().get("resultFormatValue");
+		final String resultSizeParam = api.getParams().get("resultSizeParam");
+		final String queryParams = api.getParams().get("queryParams");
+		final String entityXpath = api.getParams().get("entityXpath");
+		final String authMethod = api.getParams().get("authMethod");
+		final String authToken = api.getParams().get("authToken");
+		final String resultSizeValue = Optional
+			.ofNullable(api.getParams().get("resultSizeValue"))
+			.filter(StringUtils::isNotBlank)
+			.orElse(RESULT_SIZE_VALUE_DEFAULT);
+
+		requireNotBlank("baseUrl", baseUrl);
+		requireNotBlank("resumptionType", resumptionType);
+		requireNotBlank("resumptionParam", resumptionParam);
+		requireNotBlank("resultFormatValue", resultFormatValue);
+		requireNotBlank("queryParams", queryParams);
+		requireNotBlank("entityXpath", entityXpath);
+
+		// The iterator parses this value in its constructor: check it here so that a bad value is
+		// reported as a CollectorException rather than as an unchecked NumberFormatException.
+		try {
+			Integer.parseInt(resultSizeValue);
+		} catch (NumberFormatException e) {
+			throw new CollectorException("Param 'resultSizeValue' is not a valid integer: " + resultSizeValue, e);
+		}
+
+		final String resultOutputFormat = Optional
+			.ofNullable(api.getParams().get("resultOutputFormat"))
+			.map(String::toLowerCase)
+			.filter(StringUtils::isNotBlank)
+			.orElse(resultFormatValue.toLowerCase());
+
+		RestIterator it = new RestIterator(
+			getClientParams(),
+			baseUrl,
+			resumptionType,
+			resumptionParam,
+			resumptionXpath,
+			resultTotalXpath,
+			resultFormatParam,
+			resultFormatValue,
+			resultSizeParam,
+			resultSizeValue,
+			queryParams,
+			entityXpath,
+			authMethod,
+			authToken,
+			resultOutputFormat);
+
+		return StreamSupport
+			.stream(
+				Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
+	}
+
+	/** Throws a CollectorException naming the parameter when its value is null, empty or blank. */
+	private static void requireNotBlank(final String paramName, final String value) throws CollectorException {
+		if (StringUtils.isBlank(value)) {
+			throw new CollectorException("Param '" + paramName + "' is null or empty");
+		}
+	}
+
+	/** @return the HTTP client parameters given at construction time */
+	public HttpClientParams getClientParams() {
+		return clientParams;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
@ -0,0 +1,412 @@
+
+package eu.dnetlib.dhp.collection.plugin.rest;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import javax.xml.xpath.*;
+
+import org.apache.avro.test.http.Http;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.http.HttpHeaders;
+import org.apache.http.entity.ContentType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+import eu.dnetlib.dhp.collection.CollectorException;
+import eu.dnetlib.dhp.collection.HttpClientParams;
+import eu.dnetlib.dhp.collection.JsonUtils;
+
+/**
+ * log.info(...) equal to  log.trace(...) in the application-logs
+ * <p>
+ * known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
+ *
+ * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
+ * @date 2020-04-09
+ *
+ */
+public class RestIterator implements Iterator<String> {
+
+	private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
+	public static final String UTF_8 = "UTF-8";
+
+	private HttpClientParams clientParams;
+
+	private final String BASIC = "basic";
+
+	private JsonUtils jsonUtils;
+
+	private String baseUrl;
+	private String resumptionType;
+	private String resumptionParam;
+	private String resultFormatValue;
+	private String queryParams;
+	private int resultSizeValue;
+	private int resumptionInt = 0; // integer resumption token (first record to harvest)
+	private int resultTotal = -1;
+	private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
+																	// or token scanned from results)
+	private InputStream resultStream;
+	private Transformer transformer;
+	private XPath xpath;
+	private String query;
+	private XPathExpression xprResultTotalPath;
+	private XPathExpression xprResumptionPath;
+	private XPathExpression xprEntity;
+	private String queryFormat;
+	private String querySize;
+	private String authMethod;
+	private String authToken;
+	private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
+	private int discoverResultSize = 0;
+	private int pagination = 1;
+	/*
+	 * While resultFormatValue is added to the request parameter, this is used to say that the results are retrieved in
+	 * json. useful for cases when the target API expects a resultFormatValue != json, but the results are returned in
+	 * json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
+	 */
+	private String resultOutputFormat;
+
+	/**
+	 * Builds the iterator, compiles the XPath expressions and prepares the first query.
+	 * Compatible to version 1.3.33.
+	 *
+	 * @param clientParams HTTP client parameters
+	 * @param baseUrl base url of the REST API; the first query is baseUrl + "?" + queryParams + ...
+	 * @param resumptionType resumption strategy name
+	 * @param resumptionParam name of the request parameter carrying the resumption value
+	 * @param resumptionXpath XPath of the resumption value in a response; "/" is used when blank
+	 * @param resultTotalXpath XPath of the total number of results in a response
+	 * @param resultFormatParam name of the request parameter selecting the result format; the
+	 *        format is not added to the query when this is blank
+	 * @param resultFormatValue value sent for resultFormatParam
+	 * @param resultSizeParam name of the request parameter selecting the page size; the size is not
+	 *        added to the query when this is blank
+	 * @param resultSizeValueStr page size, must parse as an integer
+	 * @param queryParams query string appended right after the base url
+	 * @param entityXpath XPath selecting the records in a response
+	 * @param authMethod authentication method
+	 * @param authToken token sent in the Authorization header
+	 * @param resultOutputFormat format the results are actually returned in
+	 * @throws NumberFormatException when resultSizeValueStr is not an integer
+	 * @throws IllegalStateException when the transformer or one of the XPath expressions cannot be
+	 *         set up; the original exception is attached as the cause
+	 */
+	public RestIterator(
+		final HttpClientParams clientParams,
+		final String baseUrl,
+		final String resumptionType,
+		final String resumptionParam,
+		final String resumptionXpath,
+		final String resultTotalXpath,
+		final String resultFormatParam,
+		final String resultFormatValue,
+		final String resultSizeParam,
+		final String resultSizeValueStr,
+		final String queryParams,
+		final String entityXpath,
+		final String authMethod,
+		final String authToken,
+		final String resultOutputFormat) {
+
+		this.clientParams = clientParams;
+		this.jsonUtils = new JsonUtils();
+		this.baseUrl = baseUrl;
+		this.resumptionType = resumptionType;
+		this.resumptionParam = resumptionParam;
+		this.resultFormatValue = resultFormatValue;
+		this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
+		this.queryParams = queryParams;
+		this.authMethod = authMethod;
+		this.authToken = authToken;
+		this.resultOutputFormat = resultOutputFormat;
+
+		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+			: "";
+		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
+
+		try {
+			initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
+		} catch (Exception e) {
+			// keep the original exception as cause: its stack trace tells which expression failed
+			throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
+		}
+		initQueue();
+	}
+
+	/**
+	 * Creates the indenting transformer and compiles the three XPath expressions used on every page.
+	 * A blank resumptionXpath is replaced by "/".
+	 */
+	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
+		throws TransformerConfigurationException, XPathExpressionException {
+		final Transformer serializer = TransformerFactory.newInstance().newTransformer();
+		serializer.setOutputProperty(OutputKeys.INDENT, "yes");
+		serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
+		transformer = serializer;
+
+		final String effectiveResumptionXpath = StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath;
+		xpath = XPathFactory.newInstance().newXPath();
+		xprResultTotalPath = xpath.compile(resultTotalXpath);
+		xprResumptionPath = xpath.compile(effectiveResumptionXpath);
+		xprEntity = xpath.compile(entityXpath);
+	}
+
+	/** Assembles the first query from base url, query parameters, page size and format, and logs it. */
+	private void initQueue() {
+		final StringBuilder firstCall = new StringBuilder();
+		firstCall.append(baseUrl).append("?").append(queryParams).append(querySize).append(queryFormat);
+		query = firstCall.toString();
+		log.info("REST calls starting with " + query);
+	}
+
+	/** Called by hasNext() once the iteration is over. Currently a no-op placeholder. */
+	private void disconnect() {
+		// TODO close inputstream
+	}
+
+	/**
+	 * Tells whether another record is available, downloading further pages when the queue is empty.
+	 *
+	 * Pages are fetched here, and not only in next(), because a pending query does not guarantee a
+	 * pending record: when the remaining pages hold no records, answering true would make next()
+	 * return null.
+	 *
+	 * @return true when at least one record is queued
+	 * @throws RuntimeException wrapping the CollectorException raised while downloading a page
+	 * @see java.util.Iterator#hasNext()
+	 */
+	@Override
+	public boolean hasNext() {
+		synchronized (recordQueue) {
+			while (recordQueue.isEmpty() && !query.isEmpty()) {
+				try {
+					query = downloadPage(query);
+				} catch (CollectorException e) {
+					log.debug("CollectorPlugin.hasNext()-Exception: " + e);
+					throw new RuntimeException(e);
+				}
+			}
+			if (recordQueue.isEmpty()) {
+				disconnect();
+				return false;
+			}
+			return true;
+		}
+	}
+
+	/**
+	 * Returns the next queued record, downloading pages for as long as the queue is empty and a
+	 * query is still pending. The result is null when the queue is still empty afterwards.
+	 *
+	 * @throws RuntimeException wrapping the CollectorException raised while downloading a page
+	 * @see java.util.Iterator#next()
+	 */
+	@Override
+	public String next() {
+		synchronized (recordQueue) {
+			while (true) {
+				final boolean pageNeeded = recordQueue.isEmpty() && !query.isEmpty();
+				if (!pageNeeded) {
+					return recordQueue.poll();
+				}
+				try {
+					query = downloadPage(query);
+				} catch (CollectorException ce) {
+					log.debug("CollectorPlugin.next()-Exception: " + ce);
+					throw new RuntimeException(ce);
+				}
+			}
+		}
+	}
+
+	/**
+	 * Downloads one page of results, pushes every record matched by the entity XPath onto the record queue
+	 * and computes the URL of the following page according to the configured resumption type.
+	 *
+	 * @param query the URL of the page to request
+	 * @return the URL of the next page, or an empty string once resumptionInt has passed resultTotal
+	 * @throws CollectorException declared for callers; NOTE(review): failures raised inside the main try
+	 *         block, including the CollectorException of the "discover" mode, are rethrown as
+	 *         IllegalStateException
+	 * @throws IllegalStateException when the request, the parsing or the evaluation of the total fails; the
+	 *         original exception is kept as cause
+	 */
+	private String downloadPage(String query) throws CollectorException {
+		String resultJson;
+		String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+		String nextQuery = "";
+		String emptyXml = resultXml + "<" + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">";
+		Node resultNode = null;
+		NodeList nodeList = null;
+		String qUrlArgument = "";
+		int urlOldResumptionSize = 0;
+		InputStream theHttpInputStream;
+
+		// check if cursor=* is initial set otherwise add it to the queryParam URL
+		if (resumptionType.equalsIgnoreCase("deep-cursor")) {
+			log.debug("check resumptionType deep-cursor and check cursor=*?" + query);
+			if (!query.contains("&cursor=")) {
+				query += "&cursor=*";
+			}
+		}
+
+		try {
+			log.info("requestig URL [{}]", query);
+
+			URL qUrl = new URL(query);
+			log.debug("authMethod :" + authMethod);
+			if ("bearer".equalsIgnoreCase(this.authMethod)) {
+				log.trace("authMethod before inputStream: " + resultXml);
+				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+				conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
+				conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
+				conn.setRequestMethod("GET");
+				theHttpInputStream = conn.getInputStream();
+			} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
+				log.trace("authMethod before inputStream: " + resultXml);
+				HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+				conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
+				conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
+				conn.setRequestMethod("GET");
+				theHttpInputStream = conn.getInputStream();
+			} else {
+				theHttpInputStream = qUrl.openStream();
+			}
+
+			// NOTE(review): the stream is kept in the resultStream field and is not closed in this method
+			resultStream = theHttpInputStream;
+			if ("json".equals(resultOutputFormat)) {
+				// JSON responses are converted to XML so that the same XPath expressions can be applied
+				resultJson = IOUtils.toString(resultStream, UTF_8);
+				resultXml = jsonUtils.convertToXML(resultJson);
+				resultStream = IOUtils.toInputStream(resultXml, UTF_8);
+			}
+
+			if (!(emptyXml).equalsIgnoreCase(resultXml)) {
+				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
+				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
+				log.debug("nodeList.length: " + nodeList.getLength());
+				for (int i = 0; i < nodeList.getLength(); i++) {
+					StringWriter sw = new StringWriter();
+					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
+					String toEnqueue = sw.toString();
+					// isBlank already covers null, so no separate null check is needed
+					if (StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
+						log.warn("The following record resulted in empty item for the feeding queue: " + resultXml);
+					} else {
+						recordQueue.add(toEnqueue);
+					}
+				}
+			} else {
+				log.warn("resultXml is equal with emptyXml");
+			}
+
+			// advanced unconditionally; some resumption types below correct or overwrite it
+			resumptionInt += resultSizeValue;
+
+			switch (resumptionType.toLowerCase()) {
+				case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
+					resumptionStr = xprResumptionPath.evaluate(resultNode);
+					break;
+
+				case "count": // begin at one step for all records, iterate over items
+					resumptionStr = Integer.toString(resumptionInt);
+					break;
+
+				case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
+					if (resultSizeValue < 2) {
+						throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
+					}
+					qUrlArgument = qUrl.getQuery();
+					String[] arrayQUrlArgument = qUrlArgument.split("&");
+					for (String arrayUrlArgStr : arrayQUrlArgument) {
+						if (arrayUrlArgStr.startsWith(resumptionParam)) {
+							String[] resumptionKeyValue = arrayUrlArgStr.split("=");
+							if (isInteger(resumptionKeyValue[1])) {
+								urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
+								log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize);
+							} else {
+								log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]);
+							}
+						}
+					}
+
+					if (((emptyXml).equalsIgnoreCase(resultXml))
+						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
+						// resumptionStr = "";
+						if (nodeList != null) {
+							discoverResultSize += nodeList.getLength();
+						}
+						resultTotal = discoverResultSize;
+					} else {
+						resumptionStr = Integer.toString(resumptionInt);
+						resultTotal = resumptionInt + 1;
+						if (nodeList != null) {
+							discoverResultSize += nodeList.getLength();
+						}
+					}
+					log.info("discoverResultSize:  {}", discoverResultSize);
+					break;
+
+				case "pagination":
+				case "page": // pagination, iterate over page numbers
+					pagination += 1;
+					if (nodeList != null) {
+						discoverResultSize += nodeList.getLength();
+					} else {
+						resultTotal = discoverResultSize;
+						pagination = discoverResultSize;
+					}
+					resumptionInt = pagination;
+					resumptionStr = Integer.toString(resumptionInt);
+					break;
+
+				case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
+									// solr)
+					// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
+					// deep-cursor, Param 'resultSizeValue' is less than 2");}
+
+					resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
+					queryParams = queryParams.replace("&cursor=*", "");
+
+					// NOTE(review): nodeList is still null here when the page was empty; the else branch and the
+					// assignment below would then fail and surface through the catch block - confirm intended
+					// terminating if length of nodeList is 0
+					if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
+						resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
+					} else {
+						// subtract the resultSizeValue because the iteration is over real length and the
+						// resultSizeValue is added before the switch()
+						resumptionInt += (nodeList.getLength() - resultSizeValue);
+					}
+
+					discoverResultSize = nodeList.getLength();
+
+					log
+						.debug(
+							"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
+								+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
+
+					break;
+
+				default: // otherwise: abort
+					// resultTotal = resumptionInt;
+					break;
+			}
+
+		} catch (Exception e) {
+			log.error(e.getMessage(), e);
+			// keep the original exception as cause so that the stack trace is not lost
+			throw new IllegalStateException("collection failed: " + e.getMessage(), e);
+		}
+
+		try {
+			if (resultTotal == -1) {
+				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
+				if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
+					resultTotal += 1;
+				} // to correct the upper bound
+				log.info("resultTotal was -1 is now: " + resultTotal);
+			}
+		} catch (Exception e) {
+			log.error(e.getMessage(), e);
+			// keep the original exception as cause so that the stack trace is not lost
+			throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage(), e);
+		}
+		log.debug("resultTotal: " + resultTotal);
+		log.debug("resInt: " + resumptionInt);
+		if (resumptionInt <= resultTotal) {
+			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
+				+ queryFormat;
+		} else {
+			nextQuery = "";
+			// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
+			// resumptionInt and prevent a NullPointer Exception at mdStore
+		}
+		log.debug("nextQueryUrl: " + nextQuery);
+		return nextQuery;
+	}
+
+	/**
+	 * Checks whether the given text can be parsed by {@link Integer#parseInt(String)}.
+	 *
+	 * @param s the text to probe
+	 * @return true when parsing succeeds, false when it raises a NumberFormatException
+	 */
+	private boolean isInteger(String s) {
+		try {
+			Integer.parseInt(s);
+			return true;
+		} catch (NumberFormatException notANumber) {
+			// s is not an integer
+			return false;
+		}
+	}
+
+	/**
+	 * Encodes a string value using the UTF-8 encoding scheme so that it can be placed in a query string.
+	 *
+	 * @param value the raw value
+	 * @return the URL-encoded value
+	 * @throws RuntimeException wrapping the UnsupportedEncodingException raised by the encoder
+	 */
+	private String encodeValue(String value) {
+		try {
+			return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
+		} catch (UnsupportedEncodingException ex) {
+			// wrap the exception itself: its cause is usually null, which would hide the failure
+			throw new RuntimeException(ex);
+		}
+	}
+
+	/**
+	 * @return the value held in the resultFormatValue field
+	 */
+	public String getResultFormatValue() {
+		return this.resultFormatValue;
+	}
+
+	/**
+	 * @return the value held in the resultOutputFormat field
+	 */
+	public String getResultOutputFormat() {
+		return this.resultOutputFormat;
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java
@ -1,139 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker;
-
-import java.io.IOException;
-import java.net.URI;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import eu.dnetlib.collector.worker.model.ApiDescriptor;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
-import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
-import eu.dnetlib.message.Message;
-import eu.dnetlib.message.MessageManager;
-import eu.dnetlib.message.MessageType;
-
-public class DnetCollectorWorker {
-
-	private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class);
-
-	private final CollectorPluginFactory collectorPluginFactory;
-
-	private final ArgumentApplicationParser argumentParser;
-
-	private final MessageManager manager;
-
-	public DnetCollectorWorker(
-		final CollectorPluginFactory collectorPluginFactory,
-		final ArgumentApplicationParser argumentParser,
-		final MessageManager manager)
-		throws DnetCollectorException {
-		this.collectorPluginFactory = collectorPluginFactory;
-		this.argumentParser = argumentParser;
-		this.manager = manager;
-	}
-
-	public void collect() throws DnetCollectorException {
-		try {
-			final ObjectMapper jsonMapper = new ObjectMapper();
-			final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class);
-
-			final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol());
-
-			final String hdfsuri = argumentParser.get("namenode");
-
-			// ====== Init HDFS File System Object
-			Configuration conf = new Configuration();
-			// Set FileSystem URI
-			conf.set("fs.defaultFS", hdfsuri);
-			// Because of Maven
-			conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
-			conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
-
-			System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS"));
-			System.setProperty("hadoop.home.dir", "/");
-			// Get the filesystem - HDFS
-			FileSystem.get(URI.create(hdfsuri), conf);
-			Path hdfswritepath = new Path(argumentParser.get("hdfsPath"));
-
-			log.info("Created path " + hdfswritepath.toString());
-
-			final Map<String, String> ongoingMap = new HashMap<>();
-			final Map<String, String> reportMap = new HashMap<>();
-			final AtomicInteger counter = new AtomicInteger(0);
-			try (SequenceFile.Writer writer = SequenceFile
-				.createWriter(
-					conf,
-					SequenceFile.Writer.file(hdfswritepath),
-					SequenceFile.Writer.keyClass(IntWritable.class),
-					SequenceFile.Writer.valueClass(Text.class))) {
-				final IntWritable key = new IntWritable(counter.get());
-				final Text value = new Text();
-				plugin
-					.collect(api)
-					.forEach(
-						content -> {
-							key.set(counter.getAndIncrement());
-							value.set(content);
-							if (counter.get() % 10 == 0) {
-								try {
-									ongoingMap.put("ongoing", "" + counter.get());
-									log
-										.debug(
-											"Sending message: "
-												+ manager
-													.sendMessage(
-														new Message(
-															argumentParser.get("workflowId"),
-															"Collection",
-															MessageType.ONGOING,
-															ongoingMap),
-														argumentParser.get("rabbitOngoingQueue"),
-														true,
-														false));
-								} catch (Exception e) {
-									log.error("Error on sending message ", e);
-								}
-							}
-							try {
-								writer.append(key, value);
-							} catch (IOException e) {
-								throw new RuntimeException(e);
-							}
-						});
-			}
-			ongoingMap.put("ongoing", "" + counter.get());
-			manager
-				.sendMessage(
-					new Message(
-						argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap),
-					argumentParser.get("rabbitOngoingQueue"),
-					true,
-					false);
-			reportMap.put("collected", "" + counter.get());
-			manager
-				.sendMessage(
-					new Message(
-						argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
-					argumentParser.get("rabbitOngoingQueue"),
-					true,
-					false);
-			manager.close();
-		} catch (Throwable e) {
-			throw new DnetCollectorException("Error on collecting ", e);
-		}
-	}
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java
@ -1,49 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker;
-
-import org.apache.commons.io.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
-import eu.dnetlib.message.MessageManager;
-
-/**
- * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module
- * will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector
- * plugin to use and where store the data into HDFS path
- *
- * @author Sandro La Bruzzo
- */
-public class DnetCollectorWorkerApplication {
-
-	private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class);
-
-	private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory();
-
-	private static ArgumentApplicationParser argumentParser;
-
-	/** @param args */
-	public static void main(final String[] args) throws Exception {
-
-		argumentParser = new ArgumentApplicationParser(
-			IOUtils
-				.toString(
-					DnetCollectorWorker.class
-						.getResourceAsStream(
-							"/eu/dnetlib/collector/worker/collector_parameter.json")));
-		argumentParser.parseArgument(args);
-		log.info("hdfsPath =" + argumentParser.get("hdfsPath"));
-		log.info("json = " + argumentParser.get("apidescriptor"));
-		final MessageManager manager = new MessageManager(
-			argumentParser.get("rabbitHost"),
-			argumentParser.get("rabbitUser"),
-			argumentParser.get("rabbitPassword"),
-			false,
-			false,
-			null);
-		final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager);
-		worker.collect();
-	}
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java
@ -1,19 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker.utils;
-
-import java.util.LinkedList;
-
-public class CollectorPluginErrorLogList extends LinkedList<String> {
-
-	private static final long serialVersionUID = -6925786561303289704L;
-
-	@Override
-	public String toString() {
-		String log = "";
-		int index = 0;
-		for (final String errorMessage : this) {
-			log += String.format("Retry #%s: %s / ", index++, errorMessage);
-		}
-		return log;
-	}
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java
@ -1,20 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker.utils;
-
-import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
-import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
-
-public class CollectorPluginFactory {
-
-	public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException {
-		if (protocol == null)
-			throw new DnetCollectorException("protocol cannot be null");
-		switch (protocol.toLowerCase().trim()) {
-			case "oai":
-				return new OaiCollectorPlugin();
-			default:
-				throw new DnetCollectorException("UNknown protocol");
-		}
-	}
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java
@ -1,244 +0,0 @@
-
-package eu.dnetlib.dhp.collection.worker.utils;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.*;
-import java.security.GeneralSecurityException;
-import java.security.cert.X509Certificate;
-import java.util.List;
-import java.util.Map;
-
-import javax.net.ssl.HttpsURLConnection;
-import javax.net.ssl.SSLContext;
-import javax.net.ssl.TrustManager;
-import javax.net.ssl.X509TrustManager;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang.math.NumberUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
-
-public class HttpConnector {
-
-	private static final Log log = LogFactory.getLog(HttpConnector.class);
-
-	private int maxNumberOfRetry = 6;
-	private int defaultDelay = 120; // seconds
-	private int readTimeOut = 120; // seconds
-
-	private String responseType = null;
-
-	private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
-
-	public HttpConnector() {
-		CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
-	}
-
-	/**
-	 * Given the URL returns the content via HTTP GET
-	 *
-	 * @param requestUrl the URL
-	 * @return the content of the downloaded resource
-	 * @throws DnetCollectorException when retrying more than maxNumberOfRetry times
-	 */
-	public String getInputSource(final String requestUrl) throws DnetCollectorException {
-		return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
-	}
-
-	/**
-	 * Given the URL returns the content as a stream via HTTP GET
-	 *
-	 * @param requestUrl the URL
-	 * @return the content of the downloaded resource as InputStream
-	 * @throws DnetCollectorException when retrying more than maxNumberOfRetry times
-	 */
-	public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException {
-		return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
-	}
-
-	private String attemptDownlaodAsString(
-		final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
-		throws DnetCollectorException {
-		try {
-			final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
-			try {
-				return IOUtils.toString(s);
-			} catch (final IOException e) {
-				log.error("error while retrieving from http-connection occured: " + requestUrl, e);
-				Thread.sleep(defaultDelay * 1000);
-				errorList.add(e.getMessage());
-				return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
-			} finally {
-				IOUtils.closeQuietly(s);
-			}
-		} catch (final InterruptedException e) {
-			throw new DnetCollectorException(e);
-		}
-	}
-
-	private InputStream attemptDownload(
-		final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
-		throws DnetCollectorException {
-
-		if (retryNumber > maxNumberOfRetry) {
-			throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList);
-		}
-
-		log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
-		try {
-			InputStream input = null;
-
-			try {
-				final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
-				urlConn.setInstanceFollowRedirects(false);
-				urlConn.setReadTimeout(readTimeOut * 1000);
-				urlConn.addRequestProperty("User-Agent", userAgent);
-
-				if (log.isDebugEnabled()) {
-					logHeaderFields(urlConn);
-				}
-
-				final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
-				if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
-					log.warn("waiting and repeating request after " + retryAfter + " sec.");
-					Thread.sleep(retryAfter * 1000);
-					errorList.add("503 Service Unavailable");
-					urlConn.disconnect();
-					return attemptDownload(requestUrl, retryNumber + 1, errorList);
-				} else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM
-					|| urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) {
-					final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
-					log.debug("The requested url has been moved to " + newUrl);
-					errorList
-						.add(
-							String
-								.format(
-									"%s %s. Moved to: %s",
-									urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
-					urlConn.disconnect();
-					return attemptDownload(newUrl, retryNumber + 1, errorList);
-				} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
-					log
-						.error(
-							String
-								.format(
-									"HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
-					Thread.sleep(defaultDelay * 1000);
-					errorList
-						.add(
-							String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
-					urlConn.disconnect();
-					return attemptDownload(requestUrl, retryNumber + 1, errorList);
-				} else {
-					input = urlConn.getInputStream();
-					responseType = urlConn.getContentType();
-					return input;
-				}
-			} catch (final IOException e) {
-				log.error("error while retrieving from http-connection occured: " + requestUrl, e);
-				Thread.sleep(defaultDelay * 1000);
-				errorList.add(e.getMessage());
-				return attemptDownload(requestUrl, retryNumber + 1, errorList);
-			}
-		} catch (final InterruptedException e) {
-			throw new DnetCollectorException(e);
-		}
-	}
-
-	private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
-		log.debug("StatusCode: " + urlConn.getResponseMessage());
-
-		for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
-			if (e.getKey() != null) {
-				for (final String v : e.getValue()) {
-					log.debug("  key: " + e.getKey() + " - value: " + v);
-				}
-			}
-		}
-	}
-
-	private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
-		for (final String key : headerMap.keySet()) {
-			if (key != null
-				&& key.toLowerCase().equals("retry-after")
-				&& headerMap.get(key).size() > 0
-				&& NumberUtils.isNumber(headerMap.get(key).get(0))) {
-				return Integer.parseInt(headerMap.get(key).get(0)) + 10;
-			}
-		}
-		return -1;
-	}
-
-	private String obtainNewLocation(final Map<String, List<String>> headerMap)
-		throws DnetCollectorException {
-		for (final String key : headerMap.keySet()) {
-			if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) {
-				return headerMap.get(key).get(0);
-			}
-		}
-		throw new DnetCollectorException(
-			"The requested url has been MOVED, but 'location' param is MISSING");
-	}
-
-	/**
-	 * register for https scheme; this is a workaround and not intended for the use in trusted environments
-	 */
-	public void initTrustManager() {
-		final X509TrustManager tm = new X509TrustManager() {
-
-			@Override
-			public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
-			}
-
-			@Override
-			public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
-			}
-
-			@Override
-			public X509Certificate[] getAcceptedIssuers() {
-				return null;
-			}
-		};
-		try {
-			final SSLContext ctx = SSLContext.getInstance("TLS");
-			ctx.init(null, new TrustManager[] {
-				tm
-			}, null);
-			HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
-		} catch (final GeneralSecurityException e) {
-			log.fatal(e);
-			throw new IllegalStateException(e);
-		}
-	}
-
-	public int getMaxNumberOfRetry() {
-		return maxNumberOfRetry;
-	}
-
-	public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
-		this.maxNumberOfRetry = maxNumberOfRetry;
-	}
-
-	public int getDefaultDelay() {
-		return defaultDelay;
-	}
-
-	public void setDefaultDelay(final int defaultDelay) {
-		this.defaultDelay = defaultDelay;
-	}
-
-	public int getReadTimeOut() {
-		return readTimeOut;
-	}
-
-	public void setReadTimeOut(final int readTimeOut) {
-		this.readTimeOut = readTimeOut;
-	}
-
-	public String getResponseType() {
-		return responseType;
-	}
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java
@ -0,0 +1,29 @@
+
+package eu.dnetlib.dhp.transformation;
+
+/**
+ * Checked exception of the transformation package. It only mirrors the constructors of {@link Exception}.
+ */
+public class DnetTransformationException extends Exception {
+
+	/** Creates an exception without message and without cause. */
+	public DnetTransformationException() {
+		super();
+	}
+
+	/**
+	 * @param message the detail message
+	 */
+	public DnetTransformationException(final String message) {
+		super(message);
+	}
+
+	/**
+	 * @param cause the underlying failure
+	 */
+	public DnetTransformationException(final Throwable cause) {
+		super(cause);
+	}
+
+	/**
+	 * @param message the detail message
+	 * @param cause the underlying failure
+	 */
+	public DnetTransformationException(final String message, final Throwable cause) {
+		super(message, cause);
+	}
+
+	/**
+	 * @param message the detail message
+	 * @param cause the underlying failure
+	 * @param enableSuppression whether suppression is enabled
+	 * @param writableStackTrace whether the stack trace is writable
+	 */
+	public DnetTransformationException(
+		final String message,
+		final Throwable cause,
+		final boolean enableSuppression,
+		final boolean writableStackTrace) {
+		super(message, cause, enableSuppression, writableStackTrace);
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java
@ -1,15 +1,14 @@

 package eu.dnetlib.dhp.transformation;

+import static eu.dnetlib.dhp.common.Constants.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import static eu.dnetlib.dhp.utils.DHPUtils.*;

-import java.io.ByteArrayInputStream;
-import java.util.HashMap;
+import java.io.IOException;
 import java.util.Map;
-import java.util.Objects;
 import java.util.Optional;

-import org.apache.commons.cli.*;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
@ -17,22 +16,18 @@ import org.apache.spark.sql.Encoder;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.util.LongAccumulator;
-import org.dom4j.Document;
-import org.dom4j.DocumentException;
-import org.dom4j.Node;
-import org.dom4j.io.SAXReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob;
-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
-import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
-import eu.dnetlib.dhp.utils.DHPUtils;
-import eu.dnetlib.message.Message;
-import eu.dnetlib.message.MessageManager;
-import eu.dnetlib.message.MessageType;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.message.MessageSender;
+import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

 public class TransformSparkJobNode {

@ -55,67 +50,85 @@ public class TransformSparkJobNode {
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

-		final String inputPath = parser.get("input");
-		final String outputPath = parser.get("output");
-		final String workflowId = parser.get("workflowId");
-		final String trasformationRule = extractXSLTFromTR(
-			Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule"))));
+		final String mdstoreInputVersion = parser.get("mdstoreInputVersion");
+		final String mdstoreOutputVersion = parser.get("mdstoreOutputVersion");

-		final String rabbitUser = parser.get("rabbitUser");
-		final String rabbitPassword = parser.get("rabbitPassword");
-		final String rabbitHost = parser.get("rabbitHost");
-		final String rabbitReportQueue = parser.get("rabbitReportQueue");
-		final long dateOfCollection = new Long(parser.get("dateOfCollection"));
-		final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
+		final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class);
+		final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH;
+		log.info("inputPath: {}", inputPath);
+
+		final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class);
+		final String outputBasePath = cleanedMdStoreVersion.getHdfsPath();
+		log.info("outputBasePath: {}", outputBasePath);
+
+		final String isLookupUrl = parser.get("isLookupUrl");
+		log.info(String.format("isLookupUrl: %s", isLookupUrl));
+
+		final String dateOfTransformation = parser.get("dateOfTransformation");
+		log.info(String.format("dateOfTransformation: %s", dateOfTransformation));
+
+		final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
+
+		final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);
+
+		log.info("Retrieved {} vocabularies", vocabularies.vocabularyNames().size());

 		SparkConf conf = new SparkConf();
 		runWithSparkSession(
 			conf,
 			isSparkSessionManaged,
 			spark -> {
-				final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
-				final Dataset<MetadataRecord> mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder);
-				final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems");
-				final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems");
-				final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems");
-				final Map<String, Vocabulary> vocabularies = new HashMap<>();
-				vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages"));
-				final TransformFunction transformFunction = new TransformFunction(
-					totalItems,
-					errorItems,
-					transformedItems,
-					trasformationRule,
-					dateOfCollection,
-					vocabularies);
-				mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath);
-				if (rabbitHost != null) {
-					System.out.println("SEND FINAL REPORT");
-					final Map<String, String> reportMap = new HashMap<>();
-					reportMap.put("inputItem", "" + totalItems.value());
-					reportMap.put("invalidRecords", "" + errorItems.value());
-					reportMap.put("mdStoreSize", "" + transformedItems.value());
-					System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap));
-					if (!test) {
-						final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false,
-							false,
-							null);
-						manager
-							.sendMessage(
-								new Message(workflowId, "Transform", MessageType.REPORT, reportMap),
-								rabbitReportQueue,
-								true,
-								false);
-						manager.close();
-					}
-				}
+				transformRecords(
+					parser.getObjectMap(), isLookupService, spark, inputPath, outputBasePath);
 			});
-
 	}

-	private static String extractXSLTFromTR(final String tr) throws DocumentException {
-		SAXReader reader = new SAXReader();
-		Document document = reader.read(new ByteArrayInputStream(tr.getBytes()));
-		Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']");
-		return node.asXML();
+	/**
+	 * Transforms the records stored at {@code inputPath} and writes the result under {@code outputBasePath}.
+	 * <p>
+	 * The input is read as parquet, mapped through the function returned by
+	 * {@link TransformationFactory#getTransformationPlugin} and saved at {@code outputBasePath + MDSTORE_DATA_PATH};
+	 * the number of stored records is then written at {@code outputBasePath + MDSTORE_SIZE_PATH}.
+	 * When anything fails, the error message and the three counters are put in the report before the
+	 * failure is rethrown.
+	 *
+	 * @param args the job arguments; read here for {@code DNET_MESSAGE_MGR_URL} and "workflowId", and handed to the factory
+	 * @param isLookUpService the lookup service handed to the transformation factory
+	 * @param spark the active Spark session
+	 * @param inputPath the parquet location of the records to transform
+	 * @param outputBasePath the base location of the output mdstore version
+	 * @throws DnetTransformationException when the transformation function cannot be created
+	 * @throws IOException declared for the failures of the write phase
+	 */
+	public static void transformRecords(final Map<String, String> args, final ISLookUpService isLookUpService,
+		final SparkSession spark, final String inputPath, final String outputBasePath)
+		throws DnetTransformationException, IOException {
+
+		// the accumulators are registered in the same order as before: total, invalid, transformed
+		final AggregationCounter ct = new AggregationCounter(
+			spark.sparkContext().longAccumulator(CONTENT_TOTALITEMS),
+			spark.sparkContext().longAccumulator(CONTENT_INVALIDRECORDS),
+			spark.sparkContext().longAccumulator(CONTENT_TRANSFORMEDRECORDS));
+		final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
+		final String mdstoreDataPath = outputBasePath + MDSTORE_DATA_PATH;
+
+		final String dnetMessageManagerURL = args.get(DNET_MESSAGE_MGR_URL);
+		log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL);
+
+		final String workflowId = args.get("workflowId");
+		log.info("workflowId is {}", workflowId);
+
+		// the report is a resource: it is closed when this block is left, on success and on failure
+		try (AggregatorReport report = new AggregatorReport(new MessageSender(dnetMessageManagerURL, workflowId))) {
+			try {
+				final Dataset<MetadataRecord> nativeRecords = spark
+					.read()
+					.format("parquet")
+					.load(inputPath)
+					.as(encoder);
+				final Dataset<MetadataRecord> mdstore = nativeRecords
+					.map(TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), encoder);
+				saveDataset(mdstore, mdstoreDataPath);
+
+				log.info("Transformed item " + ct.getProcessedItems().count());
+				log.info("Total item " + ct.getTotalItems().count());
+				log.info("Transformation Error item " + ct.getErrorItems().count());
+
+				// the size is counted on what has actually been written, not taken from the accumulators
+				final long mdStoreSize = spark.read().load(mdstoreDataPath).count();
+				writeHdfsFile(
+					spark.sparkContext().hadoopConfiguration(),
+					"" + mdStoreSize, outputBasePath + MDSTORE_SIZE_PATH);
+			} catch (Throwable e) {
+				log.error("error during record transformation", e);
+				report.put(TransformSparkJobNode.class.getSimpleName(), e.getMessage());
+				report.put(CONTENT_TOTALITEMS, ct.getTotalItems().value().toString());
+				report.put(CONTENT_INVALIDRECORDS, ct.getErrorItems().value().toString());
+				report.put(CONTENT_TRANSFORMEDRECORDS, ct.getProcessedItems().value().toString());
+				throw e;
+			}
+		}
 	}
+
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java
@ -0,0 +1,69 @@
+
+package eu.dnetlib.dhp.transformation;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.api.java.function.MapFunction;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+/**
+ * Factory of the Spark {@link MapFunction} applied to the metadata records during the transformation phase.
+ * The implementation is selected by the "transformationPlugin" job argument; "XSLT_TRANSFORM" is the only
+ * plugin handled here.
+ */
+public class TransformationFactory {
+
+	private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class);
+
+	/**
+	 * XQuery template returning the stylesheet element found under CODE in the transformation rule profile whose
+	 * RESOURCE_IDENTIFIER value equals the {@code %s} placeholder.
+	 */
+	public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//RESOURCE_IDENTIFIER/@value = \"%s\" return $x//CODE/*[local-name() =\"stylesheet\"]";
+
+	/**
+	 * Builds the transformation function requested by the job arguments.
+	 *
+	 * @param jobArgument the job arguments; "transformationPlugin" is always read, "transformationRuleId" and
+	 *        "dateOfTransformation" are read by the XSLT_TRANSFORM plugin
+	 * @param counters the counters handed to the transformation function
+	 * @param isLookupService the service used to load the vocabularies and the transformation rule
+	 * @return the function to map over the records
+	 * @throws DnetTransformationException when a required argument is missing, the plugin is unknown, the rule
+	 *         cannot be found, or any other failure occurs (in which case the failure is wrapped)
+	 */
+	public static MapFunction<MetadataRecord, MetadataRecord> getTransformationPlugin(
+		final Map<String, String> jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService)
+		throws DnetTransformationException {
+
+		try {
+			final String transformationPlugin = jobArgument.get("transformationPlugin");
+
+			log.info("Transformation plugin required {}", transformationPlugin);
+			// switching on a null String raises a bare NullPointerException: report the missing argument instead
+			if (StringUtils.isBlank(transformationPlugin))
+				throw new DnetTransformationException("Missing Parameter transformationPlugin");
+
+			switch (transformationPlugin) {
+				case "XSLT_TRANSFORM": {
+					final String transformationRuleId = jobArgument.get("transformationRuleId");
+					if (StringUtils.isBlank(transformationRuleId))
+						throw new DnetTransformationException("Missing Parameter transformationRuleId");
+					final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);
+
+					final String transformationRule = queryTransformationRuleFromIS(
+						transformationRuleId, isLookupService);
+
+					// parseLong avoids the deprecated boxing constructor; a missing or malformed value still
+					// raises NumberFormatException, which is wrapped below
+					final long dateOfTransformation = Long.parseLong(jobArgument.get("dateOfTransformation"));
+					return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation,
+						vocabularies);
+
+				}
+				default:
+					throw new DnetTransformationException(
+						"transformation plugin does not exists for " + transformationPlugin);
+
+			}
+
+		} catch (DnetTransformationException e) {
+			// already carries a meaningful message: rethrow it as is instead of wrapping it a second time
+			throw e;
+		} catch (Throwable e) {
+			throw new DnetTransformationException(e);
+		}
+	}
+
+	/**
+	 * Looks up the stylesheet of a transformation rule on the information service.
+	 *
+	 * @param transformationRuleId the value substituted in {@link #TRULE_XQUERY}
+	 * @param isLookUpService the service the query is sent to
+	 * @return the first result of the query
+	 * @throws Exception when the query fails or returns no result
+	 */
+	private static String queryTransformationRuleFromIS(final String transformationRuleId,
+		final ISLookUpService isLookUpService) throws Exception {
+		final String query = String.format(TRULE_XQUERY, transformationRuleId);
+		log.info("asking query to IS: {}", query);
+		List<String> result = isLookUpService.quickSearchProfile(query);
+
+		if (result == null || result.isEmpty())
+			throw new DnetTransformationException(
+				"Unable to find transformation rule with name: " + transformationRuleId);
+		return result.get(0);
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java
@ -1,53 +0,0 @@
-
-package eu.dnetlib.dhp.transformation.vocabulary;
-
-import java.io.Serializable;
-
-public class Term implements Serializable {
-
-	private String englishName;
-	private String nativeName;
-	private String encoding;
-	private String code;
-	private String synonyms;
-
-	public String getEnglishName() {
-		return englishName;
-	}
-
-	public void setEnglishName(String englishName) {
-		this.englishName = englishName;
-	}
-
-	public String getNativeName() {
-		return nativeName;
-	}
-
-	public void setNativeName(String nativeName) {
-		this.nativeName = nativeName;
-	}
-
-	public String getEncoding() {
-		return encoding;
-	}
-
-	public void setEncoding(String encoding) {
-		this.encoding = encoding;
-	}
-
-	public String getCode() {
-		return code;
-	}
-
-	public void setCode(String code) {
-		this.code = code;
-	}
-
-	public String getSynonyms() {
-		return synonyms;
-	}
-
-	public void setSynonyms(String synonyms) {
-		this.synonyms = synonyms;
-	}
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java
@ -1,54 +0,0 @@
-
-package eu.dnetlib.dhp.transformation.vocabulary;
-
-import java.io.Serializable;
-import java.util.List;
-
-public class Vocabulary implements Serializable {
-
-	private String id;
-	private String name;
-	private String description;
-	private String code;
-	private List<Term> terms;
-
-	public String getId() {
-		return id;
-	}
-
-	public void setId(String id) {
-		this.id = id;
-	}
-
-	public String getName() {
-		return name;
-	}
-
-	public void setName(String name) {
-		this.name = name;
-	}
-
-	public String getDescription() {
-		return description;
-	}
-
-	public void setDescription(String description) {
-		this.description = description;
-	}
-
-	public String getCode() {
-		return code;
-	}
-
-	public void setCode(String code) {
-		this.code = code;
-	}
-
-	public List<Term> getTerms() {
-		return terms;
-	}
-
-	public void setTerms(List<Term> terms) {
-		this.terms = terms;
-	}
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java
@ -1,24 +0,0 @@
-
-package eu.dnetlib.dhp.transformation.vocabulary;
-
-import java.io.Serializable;
-import java.net.URL;
-import java.nio.charset.Charset;
-
-import org.apache.commons.io.IOUtils;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-public class VocabularyHelper implements Serializable {
-
-	private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json";
-
-	public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception {
-		final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName));
-
-		final String response = IOUtils.toString(url, Charset.defaultCharset());
-		final ObjectMapper jsonMapper = new ObjectMapper();
-		final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class);
-		return vocabulary;
-	}
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java
@ -1,25 +1,25 @@

-package eu.dnetlib.dhp.transformation.functions;
+package eu.dnetlib.dhp.transformation.xslt;

-import java.util.Map;
-import java.util.Optional;
+import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;

-import eu.dnetlib.dhp.transformation.vocabulary.Term;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
+import java.io.Serializable;
+
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import net.sf.saxon.s9api.*;
-import scala.Serializable;

 public class Cleaner implements ExtensionFunction, Serializable {

-	private final Map<String, Vocabulary> vocabularies;
+	private final VocabularyGroup vocabularies;

-	public Cleaner(Map<String, Vocabulary> vocabularies) {
+	public Cleaner(final VocabularyGroup vocabularies) {
 		this.vocabularies = vocabularies;
 	}

 	@Override
 	public QName getName() {
-		return new QName("http://eu/dnetlib/trasform/extension", "clean");
+		return new QName(QNAME_BASE_URI + "/clean", "clean");
 	}

 	@Override
@ -30,23 +30,22 @@ public class Cleaner implements ExtensionFunction, Serializable {
 	@Override
 	public SequenceType[] getArgumentTypes() {
 		return new SequenceType[] {
-			SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE),
+			SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_MORE),
 			SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE)
 		};
 	}

 	@Override
 	public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
+		XdmValue r = xdmValues[0];
+		if (r.size() == 0) {
+			return new XdmAtomicValue("");
+		}
 		final String currentValue = xdmValues[0].itemAt(0).getStringValue();
 		final String vocabularyName = xdmValues[1].itemAt(0).getStringValue();
-		Optional<Term> cleanedValue = vocabularies
-			.get(vocabularyName)
-			.getTerms()
-			.stream()
-			.filter(it -> it.getNativeName().equalsIgnoreCase(currentValue))
-			.findAny();
+		Qualifier cleanedValue = vocabularies.getSynonymAsQualifier(vocabularyName, currentValue);

 		return new XdmAtomicValue(
-			cleanedValue.isPresent() ? cleanedValue.get().getCode() : currentValue);
+			cleanedValue != null ? cleanedValue.getClassid() : currentValue);
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java
@ -0,0 +1,120 @@
+
+package eu.dnetlib.dhp.transformation.xslt;
+
+import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
+
+import java.io.Serializable;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import net.sf.saxon.s9api.*;
+
+/**
+ * Saxon extension function, registered under the local name "dateISO", that extracts a date from a free-text
+ * value and renders it as {@code yyyy-MM-dd}.
+ */
+public class DateCleaner implements ExtensionFunction, Serializable {
+
+	// shapes of the date looked for in the input, tried in this order
+	private final static List<Pattern> dateRegex = Arrays
+		.asList(
+			// Y-M-D
+			Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
+			// M-D-Y
+			Pattern
+				.compile(
+					"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
+					Pattern.MULTILINE),
+			// D-M-Y
+			Pattern
+				.compile(
+					"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
+					Pattern.MULTILINE),
+			// Y
+			Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE));
+
+	// fallback for values starting with a year, optionally followed by a month: group 1 is the year, group 4 the month
+	private final static Pattern incompleteDateRegex = Pattern
+		.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE);
+
+	// formats used to parse the text matched by dateRegex, tried in this order
+	private final static List<DateTimeFormatter> dformats = Arrays
+		.asList(
+			DateTimeFormatter
+				.ofPattern(
+					"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
+					Locale.ENGLISH),
+			DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN));
+
+	/**
+	 * Extracts a date from the given text.
+	 * <p>
+	 * The first date pattern whose match can be parsed by one of the known formats wins. When none does, a
+	 * leading year (optionally followed by a month) is completed with the missing month and day set to 01.
+	 *
+	 * @param inputDate the text to clean, may be null
+	 * @return the date as {@code yyyy-MM-dd}, or null when the input is null or no date can be recognised
+	 */
+	public String clean(final String inputDate) {
+		if (inputDate == null)
+			return null;
+
+		// findFirst (rather than findAny) guarantees that the patterns are honoured in their declaration order
+		final Optional<String> cleanedDate = dateRegex
+			.stream()
+			.map(p -> p.matcher(inputDate))
+			.filter(Matcher::find)
+			.map(m -> parseDate(m.group(0)))
+			.filter(Objects::nonNull)
+			.findFirst();
+
+		if (cleanedDate.isPresent())
+			return cleanedDate.get();
+
+		final Matcher matcher = incompleteDateRegex.matcher(inputDate);
+		if (matcher.find()) {
+			final int year = Integer.parseInt(matcher.group(1));
+			final int month = matcher.group(4) == null ? 1 : Integer.parseInt(matcher.group(4));
+			// Locale.ROOT keeps the digits ASCII whatever the default locale of the JVM is
+			return String.format(Locale.ROOT, "%d-%02d-01", year, month);
+		}
+		return null;
+	}
+
+	/**
+	 * Parses the candidate text with the first format that accepts it.
+	 *
+	 * @param candidate the text matched by one of the date patterns
+	 * @return the date as {@code yyyy-MM-dd}, or null when no format accepts the text
+	 */
+	private static String parseDate(final String candidate) {
+		for (final DateTimeFormatter f : dformats) {
+			try {
+				return LocalDate.parse(candidate, f).toString();
+			} catch (RuntimeException e) {
+				// the text does not follow this format: try the next one
+			}
+		}
+		return null;
+	}
+
+	@Override
+	public QName getName() {
+		return new QName(QNAME_BASE_URI + "/dateISO", "dateISO");
+	}
+
+	@Override
+	public SequenceType getResultType() {
+		return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
+	}
+
+	@Override
+	public SequenceType[] getArgumentTypes() {
+		return new SequenceType[] {
+			SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE)
+		};
+	}
+
+	/**
+	 * Cleans the first item of the first argument.
+	 *
+	 * @param xdmValues the call arguments; only the first one is read
+	 * @return the cleaned date, or the empty string when the argument is empty or no date can be recognised
+	 */
+	@Override
+	public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
+		final XdmValue r = xdmValues[0];
+		if (r.size() == 0) {
+			return new XdmAtomicValue("");
+		}
+		final String cleaned = clean(r.itemAt(0).getStringValue());
+		// never hand a null String to XdmAtomicValue: an unrecognised date is returned as the empty string
+		return new XdmAtomicValue(cleaned != null ? cleaned : "");
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
@ -1,41 +1,39 @@

-package eu.dnetlib.dhp.transformation;
+package eu.dnetlib.dhp.transformation.xslt;

 import java.io.ByteArrayInputStream;
 import java.io.StringWriter;
-import java.util.Map;
+import java.nio.charset.StandardCharsets;

 import javax.xml.transform.stream.StreamSource;

+import org.apache.commons.io.IOUtils;
 import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.util.LongAccumulator;

-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.transformation.functions.Cleaner;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
 import net.sf.saxon.s9api.*;

-public class TransformFunction implements MapFunction<MetadataRecord, MetadataRecord> {
+public class XSLTTransformationFunction implements MapFunction<MetadataRecord, MetadataRecord> {
+
+	public final static String QNAME_BASE_URI = "http://eu/dnetlib/transform";
+
+	private final AggregationCounter aggregationCounter;

-	private final LongAccumulator totalItems;
-	private final LongAccumulator errorItems;
-	private final LongAccumulator transformedItems;
 	private final String transformationRule;
+
 	private final Cleaner cleanFunction;

 	private final long dateOfTransformation;

-	public TransformFunction(
-		LongAccumulator totalItems,
-		LongAccumulator errorItems,
-		LongAccumulator transformedItems,
+	public XSLTTransformationFunction(
+		final AggregationCounter aggregationCounter,
 		final String transformationRule,
 		long dateOfTransformation,
-		final Map<String, Vocabulary> vocabularies)
+		final VocabularyGroup vocabularies)
 		throws Exception {
-		this.totalItems = totalItems;
-		this.errorItems = errorItems;
-		this.transformedItems = transformedItems;
+		this.aggregationCounter = aggregationCounter;
 		this.transformationRule = transformationRule;
 		this.dateOfTransformation = dateOfTransformation;
 		cleanFunction = new Cleaner(vocabularies);
@ -43,32 +41,35 @@ public class TransformFunction implements MapFunction<MetadataRecord, MetadataRe

 	@Override
 	public MetadataRecord call(MetadataRecord value) {
-		totalItems.add(1);
+		aggregationCounter.getTotalItems().add(1);
 		try {
 			Processor processor = new Processor(false);
 			processor.registerExtensionFunction(cleanFunction);
+			processor.registerExtensionFunction(new DateCleaner());
+
 			final XsltCompiler comp = processor.newXsltCompiler();
 			XsltExecutable xslt = comp
-				.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes())));
+				.compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8)));
 			XdmNode source = processor
 				.newDocumentBuilder()
-				.build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes())));
+				.build(new StreamSource(IOUtils.toInputStream(value.getBody(), StandardCharsets.UTF_8)));
 			XsltTransformer trans = xslt.load();
 			trans.setInitialContextNode(source);
 			final StringWriter output = new StringWriter();
 			Serializer out = processor.newSerializer(output);
 			out.setOutputProperty(Serializer.Property.METHOD, "xml");
 			out.setOutputProperty(Serializer.Property.INDENT, "yes");
+
 			trans.setDestination(out);
 			trans.transform();
 			final String xml = output.toString();
 			value.setBody(xml);
 			value.setDateOfTransformation(dateOfTransformation);
-			transformedItems.add(1);
+			aggregationCounter.getProcessedItems().add(1);
 			return value;
 		} catch (Throwable e) {
-			errorItems.add(1);
-			return null;
+			aggregationCounter.getErrorItems().add(1);
+			throw new RuntimeException(e);
 		}
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter
@ -0,0 +1,28 @@
+TUBYDI - Assistir Filmes e Series Online Grátis
+123Movies
+WATCH FULL MOVIE
+Movierulz
+Full Movie Online
+MOVIé WatcH
+The King of Staten Island 2020 Online For Free
+Watch Train to Busan 2 2020 online for free
+Sixth Sense Movie Novelization
+Film Complet streaming vf gratuit en ligne
+watch now free
+LIVE stream watch
+LIVE stream UFC
+RBC Heritage live stream
+MLBStreams Free
+NFL Live Stream
+Live Stream Free
+Royal Ascot 2020 Live Stream
+TV Shows Full Episodes Official
+FuboTV
+Gomovies
+Online Free Trial Access
+123watch
+DÜŞÜK HAPI
+Bebek Düşürme Yöntemleri
+WHATSAP İLETİŞİM
+Cytotec
+düşük hapı
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json
@ -0,0 +1,21 @@
+[
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the source mdstore path",
+    "paramRequired": true
+  },
+
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "the target mdstore path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "m",
+    "paramLongName": "master",
+    "paramDescription": "the master name",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json
@ -0,0 +1,33 @@
+[
+  {
+    "paramName": "s",
+    "paramLongName": "sourcePath",
+    "paramDescription": "the source mdstore path",
+    "paramRequired": true
+  },
+
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "the target mdstore path",
+    "paramRequired": true
+  },
+  {
+    "paramName": "tr",
+    "paramLongName": "transformationRule",
+    "paramDescription": "the transformation Rule",
+    "paramRequired": true
+  },
+  {
+    "paramName": "m",
+    "paramLongName": "master",
+    "paramDescription": "the master name",
+    "paramRequired": true
+  },
+  {
+    "paramName": "i",
+    "paramLongName": "isLookupUrl",
+    "paramDescription": "the isLookup URL",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json
@ -0,0 +1,33 @@
+[
+  {
+    "paramName": "t",
+    "paramLongName": "targetPath",
+    "paramDescription": "the path of the sequential file to write",
+    "paramRequired": true
+  },
+
+  {
+    "paramName": "d",
+    "paramLongName": "dataciteDumpPath",
+    "paramDescription": "the path of the Datacite dump",
+    "paramRequired": true
+  },
+  {
+    "paramName": "s",
+    "paramLongName": "skipImport",
+    "paramDescription": "skip the download of new items and only apply the previous update",
+    "paramRequired": false
+  },
+  {
+    "paramName": "n",
+    "paramLongName": "namenode",
+    "paramDescription": "the HDFS namenode URI",
+    "paramRequired": true
+  },
+  {
+    "paramName": "m",
+    "paramLongName": "master",
+    "paramDescription": "the master name",
+    "paramRequired": true
+  }
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml
@ -0,0 +1,23 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml
@ -0,0 +1,129 @@
+<workflow-app name="Transformation_Workflow" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>mdstoreInputPath</name>
+            <description>the path of the input MDStore</description>
+        </property>
+
+        <property>
+            <name>mdstoreOutputPath</name>
+            <description>the path of the cleaned mdstore</description>
+        </property>
+        <property>
+            <name>nativeInputPath</name>
+            <description>the path of the sequential file written by the Datacite import</description>
+        </property>
+        <property>
+            <name>skipimport</name>
+            <value>false</value>
+            <description>when true, skip the download of new items and only apply the previous update</description>
+        </property>
+
+
+    </parameters>
+
+    <start to="resume_from"/>
+
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <decision name="resume_from">
+        <switch>
+            <case to="TransformJob">${wf:conf('resumeFrom') eq 'TransformJob'}</case>
+            <case to="ExportDataset">${wf:conf('resumeFrom') eq 'ExportDataset'}</case>
+            <default to="ImportDatacite"/>
+        </switch>
+    </decision>
+
+    <action name="ImportDatacite">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>ImportDatacite</name>
+            <class>eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>-t</arg><arg>${nativeInputPath}</arg>
+            <arg>-d</arg><arg>${mdstoreInputPath}</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-s</arg><arg>${skipimport}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="TransformJob"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="TransformJob">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>TransformJob</name>
+            <class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${mdstoreInputPath}</arg>
+            <arg>--targetPath</arg><arg>${mdstoreOutputPath}</arg>
+            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+            <arg>-tr</arg><arg>${isLookupUrl}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="DeletePathIfExists">
+        <fs>
+            <delete path='${mdstoreOutputPath}_raw_AS'/>
+        </fs>
+        <ok to="ExportDataset"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="ExportDataset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>ExportDataset</name>
+            <class>eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${mdstoreOutputPath}</arg>
+            <arg>--targetPath</arg><arg>${mdstoreOutputPath}_raw_AS</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json
@ -1,86 +0,0 @@
-[
-  {
-    "paramName": "issm",
-    "paramLongName": "isSparkSessionManaged",
-    "paramDescription": "when true will stop SparkSession after job execution",
-    "paramRequired": false
-  },
-  {
-    "paramName": "e",
-    "paramLongName": "encoding",
-    "paramDescription": "the encoding of the input record should be JSON or XML",
-    "paramRequired": true
-  },
-  {
-    "paramName": "d",
-    "paramLongName": "dateOfCollection",
-    "paramDescription": "the date when the record has been stored",
-    "paramRequired": true
-  },
-  {
-    "paramName": "p",
-    "paramLongName": "provenance",
-    "paramDescription": "the infos about the provenance of the collected records",
-    "paramRequired": true
-  },
-  {
-    "paramName": "x",
-    "paramLongName": "xpath",
-    "paramDescription": "the xpath to identify the record identifier",
-    "paramRequired": true
-  },
-  {
-    "paramName": "i",
-    "paramLongName": "input",
-    "paramDescription": "the path of the sequencial file to read",
-    "paramRequired": true
-  },
-  {
-    "paramName": "o",
-    "paramLongName": "output",
-    "paramDescription": "the path of the result DataFrame on HDFS",
-    "paramRequired": true
-  },
-  {
-    "paramName": "ru",
-    "paramLongName": "rabbitUser",
-    "paramDescription": "the user to connect with RabbitMq for messaging",
-    "paramRequired": true
-  },
-  {
-    "paramName": "rp",
-    "paramLongName": "rabbitPassword",
-    "paramDescription": "the password to connect with RabbitMq for messaging",
-    "paramRequired": true
-  },
-  {
-    "paramName": "rh",
-    "paramLongName": "rabbitHost",
-    "paramDescription": "the host of the RabbitMq server",
-    "paramRequired": true
-  },
-  {
-    "paramName": "ro",
-    "paramLongName": "rabbitOngoingQueue",
-    "paramDescription": "the name of the ongoing queue",
-    "paramRequired": true
-  },
-  {
-    "paramName": "rr",
-    "paramLongName": "rabbitReportQueue",
-    "paramDescription": "the name of the report queue",
-    "paramRequired": true
-  },
-  {
-    "paramName": "w",
-    "paramLongName": "workflowId",
-    "paramDescription": "the identifier of the dnet Workflow",
-    "paramRequired": true
-  },
-  {
-    "paramName": "t",
-    "paramLongName": "isTest",
-    "paramDescription": "the name of the report queue",
-    "paramRequired": false
-  }
-]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json
@ -0,0 +1,62 @@
+[
+  {
+    "paramName": "a",
+    "paramLongName": "apidescriptor",
+    "paramDescription": "the JSON encoding of the API Descriptor",
+    "paramRequired": true
+  },
+  {
+    "paramName": "n",
+    "paramLongName": "namenode",
+    "paramDescription": "the Name Node URI",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mv",
+    "paramLongName": "mdStoreVersion",
+    "paramDescription": "the MDStore Version bean",
+    "paramRequired": true
+  },
+  {
+    "paramName": "dm",
+    "paramLongName": "dnetMessageManagerURL",
+    "paramDescription": "the End point URL to send Messages",
+    "paramRequired": true
+  },
+  {
+    "paramName": "w",
+    "paramLongName": "workflowId",
+    "paramDescription": "the identifier of the dnet Workflow",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mnr",
+    "paramLongName": "maxNumberOfRetry",
+    "paramDescription": "the maximum number of admitted connection retries",
+    "paramRequired": false
+  },
+  {
+    "paramName": "rqd",
+    "paramLongName": "requestDelay",
+    "paramDescription": "the delay (ms) between requests",
+    "paramRequired": false
+  },
+  {
+    "paramName": "rtd",
+    "paramLongName": "retryDelay",
+    "paramDescription": "the delay (ms) between retries",
+    "paramRequired": false
+  },
+  {
+    "paramName": "cto",
+    "paramLongName": "connectTimeOut",
+    "paramDescription": "the maximum allowed time (ms) to connect to the remote host",
+    "paramRequired": false
+  },
+  {
+    "paramName": "rto",
+    "paramLongName": "readTimeOut",
+    "paramDescription": "the maximum allowed time (ms) to receive content from the remote host",
+    "paramRequired": false
+  }
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json
@ -0,0 +1,50 @@
+[
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "when true will stop SparkSession after job execution",
+    "paramRequired": false
+  },
+  {
+    "paramName": "e",
+    "paramLongName": "encoding",
+    "paramDescription": "the encoding of the input record should be JSON or XML",
+    "paramRequired": true
+  },
+  {
+    "paramName": "d",
+    "paramLongName": "dateOfCollection",
+    "paramDescription": "the date when the record has been stored",
+    "paramRequired": true
+  },
+  {
+    "paramName": "p",
+    "paramLongName": "provenance",
+    "paramDescription": "the infos about the provenance of the collected records",
+    "paramRequired": true
+  },
+  {
+    "paramName": "x",
+    "paramLongName": "xpath",
+    "paramDescription": "the xpath to identify the record identifier",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mv",
+    "paramLongName": "mdStoreVersion",
+    "paramDescription": "the Metadata Store Version Info",
+    "paramRequired": true
+  },
+  {
+    "paramName": "rmv",
+    "paramLongName": "readMdStoreVersion",
+    "paramDescription": "the Read Lock Metadata Store Version bean",
+    "paramRequired": false
+  },
+  {
+    "paramName": "w",
+    "paramLongName": "workflowId",
+    "paramDescription": "the identifier of the dnet Workflow",
+    "paramRequired": false
+  }
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json
@ -0,0 +1,45 @@
+[
+  {
+    "paramName": "a",
+    "paramLongName": "action",
+    "paramDescription": "the action to perform on the MDStore (NEW_VERSION, COMMIT, ROLLBACK, READ_LOCK, READ_UNLOCK)",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mu",
+    "paramLongName": "mdStoreManagerURI",
+    "paramDescription": "the MDStore Manager URI",
+    "paramRequired": true
+  },
+  {
+    "paramName": "mi",
+    "paramLongName": "mdStoreID",
+    "paramDescription": "the Metadata Store ID",
+    "paramRequired": false
+  },
+  {
+    "paramName": "ms",
+    "paramLongName": "mdStoreSize",
+    "paramDescription": "the Metadata Store Size",
+    "paramRequired": false
+  },
+  {
+    "paramName": "mv",
+    "paramLongName": "mdStoreVersion",
+    "paramDescription": "the Metadata Version Bean",
+    "paramRequired": false
+  },
+  {
+    "paramName": "n",
+    "paramLongName": "namenode",
+    "paramDescription": "the Name Node URI",
+    "paramRequired": false
+  },
+  {
+    "paramName": "rm",
+    "paramLongName": "readMDStoreId",
+    "paramDescription": "the ID Locked to Read",
+    "paramRequired": false
+  }
+
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml
@ -0,0 +1,22 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml
@ -1,112 +1,212 @@
 <workflow-app name="CollectionWorkflow" xmlns="uri:oozie:workflow:0.5">
    <parameters>
-        <property>
-            <name>sequenceFilePath</name>
-            <description>the path to store the sequence file of the native metadata collected</description>
-        </property>
-
-        <property>
-            <name>mdStorePath</name>
-            <description>the path of the native mdstore</description>
-        </property>
-
        <property>
            <name>apiDescription</name>
            <description>A json encoding of the API Description class</description>
        </property>
-
        <property>
            <name>dataSourceInfo</name>
            <description>A json encoding of the Datasource Info</description>
        </property>
        <property>
            <name>identifierPath</name>
-            <description>An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier </description>
+            <description>An xpath to retrieve the metadata identifier for the generation of DNet Identifier </description>
        </property>
-
        <property>
            <name>metadataEncoding</name>
            <description> The type of the metadata XML/JSON</description>
        </property>
-
        <property>
            <name>timestamp</name>
            <description>The timestamp of the collection date</description>
        </property>
-
        <property>
            <name>workflowId</name>
            <description>The identifier of the workflow</description>
        </property>
+        <property>
+            <name>mdStoreID</name>
+            <description>The identifier of the mdStore</description>
+        </property>
+        <property>
+            <name>mdStoreManagerURI</name>
+            <description>The URI of the MDStore Manager</description>
+        </property>
+
+        <property>
+            <name>dnetMessageManagerURL</name>
+            <description>The URI of the Dnet Message Manager</description>
+        </property>
+        <property>
+            <name>collectionMode</name>
+            <description>Should be REFRESH or INCREMENTAL</description>
+        </property>
+
+        <property>
+            <name>collection_java_xmx</name>
+            <value>-Xmx200m</value>
+            <description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
+        </property>
+
+
    </parameters>

-    <start to="DeleteMDStoresNative"/>
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+    </global>
+
+    <start to="collection_mode"/>
+
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
-    <action name="DeleteMDStoresNative">
-        <fs>
-            <mkdir path='${sequenceFilePath}'/>
-            <mkdir path='${mdStorePath}'/>
-            <delete path='${sequenceFilePath}'/>
-            <delete path='${mdStorePath}'/>
-        </fs>
+
+    <decision name="collection_mode">
+        <switch>
+            <case to="StartTransaction">${wf:conf('collectionMode') eq 'REFRESH'}</case>
+            <case to="BeginRead">${wf:conf('collectionMode') eq 'INCREMENTAL'}</case>
+            <default to="StartTransaction"/>
+        </switch>
+    </decision>
+
+    <action name="BeginRead">
+        <java>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <java-opts>${collection_java_xmx}</java-opts>
+            <arg>--action</arg><arg>READ_LOCK</arg>
+            <arg>--mdStoreID</arg><arg>${mdStoreID}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <capture-output/>
+        </java>
+        <ok to="StartTransaction"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="StartTransaction">
+        <java>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <java-opts>${collection_java_xmx}</java-opts>
+            <arg>--action</arg><arg>NEW_VERSION</arg>
+            <arg>--mdStoreID</arg><arg>${mdStoreID}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <capture-output/>
+        </java>
        <ok to="CollectionWorker"/>
        <error to="Kill"/>
    </action>
+
    <action name="CollectionWorker">
        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.dhp.collection.worker.DnetCollectorWorker</main-class>
-            <java-opts>-p</java-opts><java-opts>${sequenceFilePath}</java-opts>
-            <java-opts>-a</java-opts><java-opts>${apiDescription}</java-opts>
-            <java-opts>-n</java-opts><java-opts>${nameNode}</java-opts>
-            <java-opts>-rh</java-opts><java-opts>${rmq_host}</java-opts>
-            <java-opts>-ru</java-opts><java-opts>${rmq_user}</java-opts>
-            <java-opts>-rp</java-opts><java-opts>${rmq_pwd}</java-opts>
-            <java-opts>-rr</java-opts><java-opts>${rmq_report}</java-opts>
-            <java-opts>-ro</java-opts><java-opts>${rmq_ongoing}</java-opts>
-            <java-opts>-u</java-opts><java-opts>sandro.labruzzo</java-opts>
-            <java-opts>-w</java-opts><java-opts>${workflowId}</java-opts>
+            <main-class>eu.dnetlib.dhp.collection.CollectorWorkerApplication</main-class>
+            <java-opts>${collection_java_xmx}</java-opts>
+            <arg>--apidescriptor</arg><arg>${apiDescription}</arg>
+            <arg>--namenode</arg><arg>${nameNode}</arg>
+            <arg>--workflowId</arg><arg>${workflowId}</arg>
+            <arg>--dnetMessageManagerURL</arg><arg>${dnetMessageManagerURL}</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--maxNumberOfRetry</arg><arg>${maxNumberOfRetry}</arg>
+            <arg>--requestDelay</arg><arg>${requestDelay}</arg>
+            <arg>--retryDelay</arg><arg>${retryDelay}</arg>
+            <arg>--connectTimeOut</arg><arg>${connectTimeOut}</arg>
+            <arg>--readTimeOut</arg><arg>${readTimeOut}</arg>
        </java>
        <ok to="GenerateNativeStoreSparkJob"/>
-        <error to="Kill"/>
-    </action>
-    <action name="GenerateNativeStoreSparkJob">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>GenerateNativeStoreSparkJob</name>
-            <class>eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob</class>
-            <jar>dhp-aggregations-1.0.0-SNAPSHOT.jar</jar>
-            <spark-opts>--num-executors 50 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot;</spark-opts>
-            <arg>--encoding</arg> <arg>${metadataEncoding}</arg>
-            <arg>--dateOfCollection</arg> <arg>${timestamp}</arg>
-            <arg>--provenance</arg> <arg> ${dataSourceInfo}</arg>
-            <arg>--xpath</arg><arg>${identifierPath}</arg>
-            <arg>--input</arg><arg>${sequenceFilePath}</arg>
-            <arg>--output</arg><arg>${mdStorePath}</arg>
-            <arg>-rh</arg><arg>${rmq_host}</arg>
-            <arg>-ru</arg><arg>${rmq_user}</arg>
-            <arg>-rp</arg><arg>${rmq_pwd}</arg>
-            <arg>-rr</arg><arg>${rmq_report}</arg>
-            <arg>-ro</arg><arg>${rmq_ongoing}</arg>
-            <arg>-w</arg><arg>${workflowId}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="DropInvalidStore"/>
+        <error to="FailCollection"/>
    </action>

-    <action name="DropInvalidStore">
-        <fs>
-            <delete path='${mdStorePath}/../'/>
-        </fs>
+    <action name="GenerateNativeStoreSparkJob">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Generate Native MetadataStore</name>
+            <class>eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--encoding</arg><arg>${metadataEncoding}</arg>
+            <arg>--dateOfCollection</arg><arg>${timestamp}</arg>
+            <arg>--provenance</arg><arg>${dataSourceInfo}</arg>
+            <arg>--xpath</arg><arg>${identifierPath}</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--readMdStoreVersion</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
+        </spark>
+        <ok to="collection_mode_end"/>
+        <error to="FailCollection"/>
+    </action>
+
+    <decision name="collection_mode_end">
+        <switch>
+            <case to="CommitVersion">${wf:conf('collectionMode') eq 'REFRESH'}</case>
+            <case to="EndRead">${wf:conf('collectionMode') eq 'INCREMENTAL'}</case>
+            <default to="CommitVersion"/>
+        </switch>
+    </decision>
+
+    <action name="EndRead">
+        <java>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <java-opts>${collection_java_xmx}</java-opts>
+            <arg>--action</arg><arg>READ_UNLOCK</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
+        </java>
+        <ok to="CommitVersion"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CommitVersion">
+        <java>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <java-opts>${collection_java_xmx}</java-opts>
+            <arg>--action</arg><arg>COMMIT</arg>
+            <arg>--namenode</arg><arg>${nameNode}</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="FailCollection">
+        <switch>
+            <case to="RollBack">${wf:conf('collectionMode') eq 'REFRESH'}</case>
+            <case to="EndReadRollBack">${wf:conf('collectionMode') eq 'INCREMENTAL'}</case>
+            <default to="RollBack"/>
+        </switch>
+    </decision>
+
+    <action name="EndReadRollBack">
+        <java>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <java-opts>${collection_java_xmx}</java-opts>
+            <arg>--action</arg><arg>READ_UNLOCK</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
+        </java>
+        <ok to="RollBack"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="RollBack">
+        <java>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <java-opts>${collection_java_xmx}</java-opts>
+            <arg>--action</arg><arg>ROLLBACK</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
        <ok to="Kill"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
+
 </workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json
@ -1,12 +0,0 @@
-[
-  {"paramName":"p",   "paramLongName":"hdfsPath",           "paramDescription": "the path where storing the sequential file",           "paramRequired": true},
-  {"paramName":"a",   "paramLongName":"apidescriptor",      "paramDescription": "the JSON encoding of the API Descriptor",              "paramRequired": true},
-  {"paramName":"n",   "paramLongName":"namenode",           "paramDescription": "the Name Node URI",                                    "paramRequired": true},
-  {"paramName":"u",   "paramLongName":"userHDFS",           "paramDescription": "the user wich create the hdfs seq file",               "paramRequired": true},
-  {"paramName":"ru",  "paramLongName":"rabbitUser",         "paramDescription": "the user to connect with RabbitMq for messaging",      "paramRequired": true},
-  {"paramName":"rp",  "paramLongName":"rabbitPassword",     "paramDescription": "the password to connect with RabbitMq for messaging",  "paramRequired": true},
-  {"paramName":"rh",  "paramLongName":"rabbitHost",         "paramDescription": "the host of the RabbitMq server",                      "paramRequired": true},
-  {"paramName":"ro",  "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue",                        "paramRequired": true},
-  {"paramName":"rr",  "paramLongName":"rabbitReportQueue",  "paramDescription": "the name of the report queue",                         "paramRequired": true},
-  {"paramName":"w",   "paramLongName":"workflowId",         "paramDescription": "the identifier of the dnet Workflow",                  "paramRequired": true}
-]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml
@ -0,0 +1,19 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+
+</configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml
@ -1,76 +1,187 @@
 <workflow-app name="Transformation_Workflow" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
-            <name>mdstoreInputPath</name>
-            <description>the path of the input MDStore</description>
+            <name>mdStoreInputId</name>
+            <description>the identifier of the native MDStore</description>
        </property>
-
        <property>
-            <name>mdstoreOutputPath</name>
+            <name>mdStoreOutputId</name>
+            <description>the identifier of the cleaned MDStore</description>
+        </property>
+        <property>
+            <name>mdStoreManagerURI</name>
            <description>The URI of the MDStore Manager</description>
        </property>
-
        <property>
-            <name>transformationRule</name>
+            <name>transformationRuleId</name>
            <description>The transformation Rule to apply</description>
        </property>
-
        <property>
-            <name>timestamp</name>
-            <description>The timestamp of the collection date</description>
+            <name>transformationPlugin</name>
+            <value>XSLT_TRANSFORM</value>
+            <description>The transformation Plugin</description>
+        </property>
+        <property>
+            <name>dateOfTransformation</name>
+            <description>The timestamp of the transformation date</description>
+        </property>
+        <property>
+            <name>isLookupUrl</name>
+            <description>The IS lookUp service endpoint</description>
        </property>
-
        <property>
            <name>workflowId</name>
            <description>The identifier of the workflow</description>
        </property>
+        <property>
+            <name>dnetMessageManagerURL</name>
+            <description>The URI of the Dnet Message Manager</description>
+        </property>
    </parameters>

-    <start to="DeletePathIfExists"/>
+    <start to="BeginRead"/>
+
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
-    <action name="DeletePathIfExists">
-        <fs>
-            <mkdir path='${mdstoreOutputPath}'/>
-            <delete path='${mdstoreOutputPath}'/>
-        </fs>
-        <ok to="TransformJob"/>
+
+    <action name="BeginRead">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>READ_LOCK</arg>
+            <arg>--mdStoreID</arg><arg>${mdStoreInputId}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <capture-output/>
+        </java>
+        <ok to="StartTransaction"/>
        <error to="Kill"/>
    </action>
-    <action name="TransformJob">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn</master>
-            <mode>cluster</mode>
-            <name>MDBuilder</name>
-            <class>eu.dnetlib.dhp.transformation.TransformSparkJobNode</class>
-            <jar>dhp-aggregations-1.0.0-SNAPSHOT.jar</jar>
-            <spark-opts>--num-executors 50 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot;</spark-opts>
-            <arg>--dateOfCollection</arg> <arg>${timestamp}</arg>
-            <arg>-mt</arg> <arg>yarn</arg>
-            <arg>--input</arg><arg>${mdstoreInputPath}</arg>
-            <arg>--output</arg><arg>${mdstoreOutputPath}</arg>
-            <arg>-w</arg><arg>${workflowId}</arg>
-            <arg>-tr</arg><arg>${transformationRule}</arg>
-            <arg>-ru</arg><arg>${rmq_user}</arg>
-            <arg>-rp</arg><arg>${rmq_pwd}</arg>
-            <arg>-rh</arg><arg>${rmq_host}</arg>
-            <arg>-ro</arg><arg>${rmq_ongoing}</arg>
-            <arg>-rr</arg><arg>${rmq_report}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="DropInvalidStore"/>
+
+    <action name="StartTransaction">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>NEW_VERSION</arg>
+            <arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <capture-output/>
+        </java>
+        <ok to="TransformJob"/>
+        <error to="EndReadRollBack"/>
    </action>

-    <action name="DropInvalidStore">
-        <fs>
-            <delete path='${mdstoreOutputPath}/../'/>
-        </fs>
+    <action name="TransformJob">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Transform MetadataStore</name>
+            <class>eu.dnetlib.dhp.transformation.TransformSparkJobNode</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdstoreInputVersion</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
+            <arg>--dateOfTransformation</arg><arg>${dateOfTransformation}</arg>
+            <arg>--transformationPlugin</arg><arg>${transformationPlugin}</arg>
+            <arg>--transformationRuleId</arg><arg>${transformationRuleId}</arg>
+            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+            <arg>--workflowId</arg><arg>${workflowId}</arg>
+            <arg>--dnetMessageManagerURL</arg><arg>${dnetMessageManagerURL}</arg>
+        </spark>
+        <ok to="EndRead"/>
+        <error to="EndReadRollBack"/>
+    </action>
+
+    <action name="EndRead">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>READ_UNLOCK</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
+            <capture-output/>
+        </java>
+        <ok to="CommitVersion"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CommitVersion">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>COMMIT</arg>
+            <arg>--namenode</arg><arg>${nameNode}</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="EndReadRollBack">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>READ_UNLOCK</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+            <arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
+            <capture-output/>
+            </java>
+        <ok to="RollBack"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="RollBack">
+        <java>
+            <configuration>
+                <property>
+                    <name>oozie.launcher.mapreduce.user.classpath.first</name>
+                    <value>true</value>
+                </property>
+            </configuration>
+            <main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
+            <arg>--action</arg><arg>ROLLBACK</arg>
+            <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
+            <arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
+        </java>
        <ok to="Kill"/>
        <error to="Kill"/>
    </action>
    
    <end name="End"/>
+
 </workflow-app>
--- a/Show More
+++ b/Show More