Merge branch 'stable_ids' of code-repo.d4science.org:D-Net/dnet-hadoop into stable_ids

2021-04-14 18:06:34 +02:00 · 2021-04-14 18:06:34 +02:00 · 176a5e493d
parent 3525a8f504 745fa92db8
commit 176a5e493d
20 changed files with 905 additions and 862 deletions
--- a/100.patch
+++ b/100.patch
@ -1,757 +0,0 @@
-From c5fbad8093ca27deebf1b5fd5ffd39e1877c533d Mon Sep 17 00:00:00 2001
-From: antleb <antleb@di.uoa.gr>
-Date: Thu, 4 Mar 2021 00:42:21 +0200
-Subject: [PATCH 1/8] Contexts are now downloaded instead of using the
- stats_ext db
-
---
- .../dhp/oa/graph/stats/oozie_app/contexts.sh  | 33 +++++++++++++++++++
- .../graph/stats/oozie_app/scripts/step10.sql  | 13 --------
- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 17 ++++++++++
- 3 files changed, 50 insertions(+), 13 deletions(-)
- create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-new file mode 100644
-index 00000000..f06a43bb
--- /dev/null
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-@@ -0,0 +1,33 @@
-+#!/usr/bin/env bash
-+
-+CONTEXT_API=$1
-+TARGET_DB=$2
-+
-+TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
-+
-+echo "Downloading context data"
-+curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
-+cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
-+cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
-+cat contexts.csv  | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv
-+cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv
-+
-+echo "uploading context data to hdfs"
-+hdfs dfs -mkdir ${TMP}
-+hdfs dfs -copyFromLocal contexts.csv ${TMP}
-+hdfs dfs -copyFromLocal categories.csv ${TMP}
-+hdfs dfs -copyFromLocal concepts.csv ${TMP}
-+hdfs dfs -chmod -R 777 ${TMP}
-+
-+echo "Creating and populating impala tables"
-+impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';"
-+impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';"
-+impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';"
-+impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;"
-+impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;"
-+impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;"
-+
-+echo "Cleaning up"
-+hdfs dfs -rm -f -r -skipTrash ${TMP}
-+
-+echo "Finito!"
-\ No newline at end of file
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
-index 6c96317e..77fbd3b1 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql
-@@ -23,19 +23,6 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
- SELECT *
- FROM ${external_stats_db_name}.rndexpediture;
- 
-CREATE OR REPLACE VIEW ${stats_db_name}.context AS
-SELECT *
-FROM ${external_stats_db_name}.context;
-
-CREATE OR REPLACE VIEW ${stats_db_name}.category AS
-SELECT *
-FROM ${external_stats_db_name}.category;
-
-CREATE OR REPLACE VIEW ${stats_db_name}.concept AS
-SELECT *
-FROM ${external_stats_db_name}.concept;
-
-
- ------------------------------------------------------------------------------------------------
- ------------------------------------------------------------------------------------------------
- -- Creation date of the database
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-index 9c16f149..afb10c41 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-@@ -41,6 +41,10 @@
-             <name>hive_timeout</name>
-             <description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
-         </property>
-+        <property>
-+            <name>context_api_url</name>
-+            <description>the base url of the context api (https://services.openaire.eu/openaire)</description>
-+        </property>
-     </parameters>
- 
-     <global>
-@@ -263,6 +267,19 @@
-         <ok to="Step19"/>
-         <error to="Kill"/>
-     </action>
-+
-+    <action name="Step17">
-+        <shell xmlns="uri:oozie:shell-action:0.1">
-+            <job-tracker>${jobTracker}</job-tracker>
-+            <name-node>${nameNode}</name-node>
-+            <exec>contexts.sh</exec>
-+            <argument>${context_api_url}</argument>
-+            <argument>${stats_db_name}</argument>
-+            <file>contexts.sh</file>
-+        </shell>
-+        <ok to="step20-createMonitorDB"/>
-+        <error to="Kill"/>
-+    </action>
-         
-     <action name="Step19">
-         <shell xmlns="uri:oozie:shell-action:0.1">
-- 
-2.17.1
-
-
-From 6147ee495053634436abe822aaf9ba909813d8c4 Mon Sep 17 00:00:00 2001
-From: antleb <antleb@di.uoa.gr>
-Date: Fri, 5 Mar 2021 14:12:18 +0200
-Subject: [PATCH 2/8] assigning correctly hive contexts to concepts
-
---
- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh    | 7 +++++--
- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql | 5 ++++-
- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql | 5 ++++-
- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql | 5 ++++-
- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql | 5 ++++-
- 5 files changed, 21 insertions(+), 6 deletions(-)
-
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-index f06a43bb..6788f88b 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-@@ -9,8 +9,8 @@ echo "Downloading context data"
- curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
- cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
- cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
-cat contexts.csv  | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv
-cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv
-+cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
-+cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
- 
- echo "uploading context data to hdfs"
- hdfs dfs -mkdir ${TMP}
-@@ -29,5 +29,8 @@ impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.
- 
- echo "Cleaning up"
- hdfs dfs -rm -f -r -skipTrash ${TMP}
-+rm concepts.csv
-+rm categories.csv
-+rm contexts.csv
- 
- echo "Finito!"
-\ No newline at end of file
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
-index 62a15856..75b24b18 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
-@@ -47,7 +47,10 @@ from ${openaire_db_name}.publication p
- where p.datainfo.deletedbyinference = false;
- 
- CREATE TABLE ${stats_db_name}.publication_concepts AS
-SELECT substr(p.id, 4) as id, contexts.context.id as concept
-+SELECT substr(p.id, 4) as id, case
-+    when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
-+    when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
-+    when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
- from ${openaire_db_name}.publication p
-          LATERAL VIEW explode(p.context) contexts as context
- where p.datainfo.deletedbyinference = false;
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
-index dcd5ad85..540cc03a 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql
-@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.dataset p
- where p.datainfo.deletedbyinference = false;
- 
- CREATE TABLE ${stats_db_name}.dataset_concepts AS
-SELECT substr(p.id, 4) as id, contexts.context.id as concept
-+SELECT substr(p.id, 4) as id, case
-+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
-+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
-+                                  when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
- from ${openaire_db_name}.dataset p
-          LATERAL VIEW explode(p.context) contexts as context
- where p.datainfo.deletedbyinference = false;
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
-index fd5390e6..54345e07 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
-@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.software p
- where p.datainfo.deletedbyinference = false;
- 
- CREATE TABLE ${stats_db_name}.software_concepts AS
-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
-+SELECT substr(p.id, 4) as id, case
-+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
-+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
-+                                  when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
- FROM ${openaire_db_name}.software p
-          LATERAL VIEW explode(p.context) contexts AS context
- where p.datainfo.deletedbyinference = false;
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
-index b359b596..36ad5d92 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
-@@ -52,7 +52,10 @@ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.
- where p.datainfo.deletedbyinference = false;
- 
- CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
-+SELECT substr(p.id, 4) as id, case
-+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
-+                                  when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
-+                                  when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
- FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
- where p.datainfo.deletedbyinference = false;
- 
-- 
-2.17.1
-
-
-From f40c150a0d549e2dbcfd42ecf81e17ad4b505391 Mon Sep 17 00:00:00 2001
-From: antleb <antleb@di.uoa.gr>
-Date: Sat, 6 Mar 2021 00:35:57 +0200
-Subject: [PATCH 3/8] fixed steps...
-
---
- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml      | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-index afb10c41..2184cb8a 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-@@ -264,7 +264,7 @@
-             <param>stats_db_name=${stats_db_name}</param>
-             <param>openaire_db_name=${openaire_db_name}</param>
-         </hive2>
-        <ok to="Step19"/>
-+        <ok to="Step17"/>
-         <error to="Kill"/>
-     </action>
- 
-@@ -277,7 +277,7 @@
-             <argument>${stats_db_name}</argument>
-             <file>contexts.sh</file>
-         </shell>
-        <ok to="step20-createMonitorDB"/>
-+        <ok to="step19"/>
-         <error to="Kill"/>
-     </action>
-         
-- 
-2.17.1
-
-
-From fa1ec5b5e9b6038b3b565422af5c6406f21220d3 Mon Sep 17 00:00:00 2001
-From: antleb <antleb@di.uoa.gr>
-Date: Wed, 10 Mar 2021 14:05:58 +0200
-Subject: [PATCH 4/8] fixed typo...
-
---
- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml        | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-index 2184cb8a..321500e2 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-@@ -277,7 +277,7 @@
-             <argument>${stats_db_name}</argument>
-             <file>contexts.sh</file>
-         </shell>
-        <ok to="step19"/>
-+        <ok to="Step19"/>
-         <error to="Kill"/>
-     </action>
-         
-- 
-2.17.1
-
-
-From 3c75a050443942b632cf8469b5af16a8c61e7569 Mon Sep 17 00:00:00 2001
-From: antleb <antleb@di.uoa.gr>
-Date: Fri, 12 Mar 2021 13:47:04 +0200
-Subject: [PATCH 5/8] fixed a ton of typos
-
---
- .../scripts/computeProductionStats.sql        |  8 -------
- .../stats/oozie_app/updateProductionViews.sh  | 18 ++++++++++++++++
- .../dhp/oa/graph/stats/oozie_app/contexts.sh  | 21 ++++++++++++-------
- 3 files changed, 32 insertions(+), 15 deletions(-)
- delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
- create mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
-
-diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
-deleted file mode 100644
-index 34e48a18..00000000
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql
-+++ /dev/null
-@@ -1,8 +0,0 @@
-------------------------------------------------------
-------------------------------------------------------
--- Impala table statistics - Needed to make the tables
--- visible for impala
-------------------------------------------------------
-------------------------------------------------------
-
-INVALIDATE METADATA ${stats_db_name};
-diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
-new file mode 100644
-index 00000000..57acb2ee
--- /dev/null
-+++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
-@@ -0,0 +1,18 @@
-+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
-+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
-+if ! [ -L $link_folder ]
-+then
-+    rm -Rf "$link_folder"
-+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
-+fi
-+
-+export SOURCE=$1
-+export SHADOW=$2
-+
-+echo "Updating shadow database"
-+impala-shell -d ${SOURCE} -q "invalidate metadata"
-+impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f -
-+impala-shell -q "create database if not exists ${SHADOW}"
-+impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
-+impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
-+echo "Shadow db ready!"
-\ No newline at end of file
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-index 6788f88b..c28be50d 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-@@ -1,4 +1,10 @@
-#!/usr/bin/env bash
-+export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
-+export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
-+if ! [ -L $link_folder ]
-+then
-+    rm -Rf "$link_folder"
-+    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
-+fi
- 
- CONTEXT_API=$1
- TARGET_DB=$2
-@@ -20,12 +26,13 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP}
- hdfs dfs -chmod -R 777 ${TMP}
- 
- echo "Creating and populating impala tables"
-impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';"
-impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';"
-impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';"
-impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;"
-impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;"
-impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;"
-+impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
-+impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
-+impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
-+impala-shell -d ${TARGET_DB} -q "invalidate metadata"
-+impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
-+impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
-+impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
- 
- echo "Cleaning up"
- hdfs dfs -rm -f -r -skipTrash ${TMP}
-- 
-2.17.1
-
-
-From 236435b47010ea1ab94c3f018dcf278f5d2c44aa Mon Sep 17 00:00:00 2001
-From: antleb <antleb@di.uoa.gr>
-Date: Fri, 12 Mar 2021 14:11:21 +0200
-Subject: [PATCH 6/8] following redirects
-
---
- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh     | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-index c28be50d..29b225e3 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
-@@ -12,9 +12,9 @@ TARGET_DB=$2
- TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
- 
- echo "Downloading context data"
-curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
-cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
-cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
-+curl -L ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
-+cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
-+cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
- cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
- cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
- 
-- 
-2.17.1
-
-
-From 60ebdf2dbe704733809f401df70bffcf49cede29 Mon Sep 17 00:00:00 2001
-From: antleb <antleb@di.uoa.gr>
-Date: Fri, 12 Mar 2021 16:34:53 +0200
-Subject: [PATCH 7/8] update promote wf to support monitor&production
-
---
- .../oa/graph/stats/oozie_app/impala-shell.sh  |  18 --
- .../scripts/updateProductionViews.sql         | 207 ------------------
- 2 files changed, 225 deletions(-)
- delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
- delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
-
-diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
-deleted file mode 100644
-index 70112dc7..00000000
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
-+++ /dev/null
-@@ -1,18 +0,0 @@
-export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
-export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
-if ! [ -L $link_folder ]
-then
-    rm -Rf "$link_folder"
-    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
-fi
-
-echo "Getting file from " $3
-hdfs dfs -copyToLocal $3
-
-echo "Running impala shell make the new database visible"
-impala-shell -q "INVALIDATE METADATA;"
-
-echo "Running impala shell to compute new table stats"
-impala-shell -d $1 -f $2
-echo "Impala shell finished"
-rm $2
-diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
-deleted file mode 100644
-index 48f8d58f..00000000
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
-+++ /dev/null
-@@ -1,207 +0,0 @@
-------------------------------------------------------
-------------------------------------------------------
--- Shadow schema table exchange
-------------------------------------------------------
-------------------------------------------------------
-
--- Dropping old views
-DROP VIEW IF EXISTS ${stats_db_production_name}.category;
-DROP VIEW IF EXISTS ${stats_db_production_name}.concept;
-DROP VIEW IF EXISTS ${stats_db_production_name}.context;
-DROP VIEW IF EXISTS ${stats_db_production_name}.country;
-DROP VIEW IF EXISTS ${stats_db_production_name}.countrygdp;
-DROP VIEW IF EXISTS ${stats_db_production_name}.creation_date;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_topics;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_organizations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_results;
-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.funder;
-DROP VIEW IF EXISTS ${stats_db_production_name}.fundref;
-DROP VIEW IF EXISTS ${stats_db_production_name}.numbers_country;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_projects;
-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_topics;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_organizations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_resultcount;
-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results_publication;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_topics;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_affiliated_country;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_deposited_country;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_fundercount;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_gold;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_greenoa;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_organization;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_peerreviewed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projectcount;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projects;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.result_topics;
-DROP VIEW IF EXISTS ${stats_db_production_name}.rndexpediture;
-DROP VIEW IF EXISTS ${stats_db_production_name}.roarmap;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_citations;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_classifications;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_concepts;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_datasources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_languages;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_licenses;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_oids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_pids;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_refereed;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_sources;
-DROP VIEW IF EXISTS ${stats_db_production_name}.software_topics;
-
-
--- Creating the shadow database, in case it doesn't exist
-CREATE database IF NOT EXISTS ${stats_db_production_name};
-
--- Creating new views
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.category AS SELECT * FROM ${stats_db_name}.category;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.concept AS SELECT * FROM ${stats_db_name}.concept;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.context AS SELECT * FROM ${stats_db_name}.context;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.country AS SELECT * FROM ${stats_db_name}.country;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.funder AS SELECT * FROM ${stats_db_name}.funder;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization AS SELECT * FROM ${stats_db_name}.organization;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project AS SELECT * FROM ${stats_db_name}.project;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication AS SELECT * FROM ${stats_db_name}.publication;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result AS SELECT * FROM ${stats_db_name}.result;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software AS SELECT * FROM ${stats_db_name}.software;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources;
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics;
-- 
-2.17.1
-
-
-From 0ba0a6b9dac25f5ec73e8eafefbf7f91442ad1c5 Mon Sep 17 00:00:00 2001
-From: antleb <antleb@di.uoa.gr>
-Date: Fri, 12 Mar 2021 16:42:59 +0200
-Subject: [PATCH 8/8] update promote wf to support monitor&production
-
---
- .../stats/oozie_app/updateProductionViews.sh  | 14 +++----
- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 37 ++++++++++++-------
- 2 files changed, 29 insertions(+), 22 deletions(-)
-
-diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
-index 57acb2ee..3e510e87 100644
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
-+++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh
-@@ -7,12 +7,10 @@ then
- fi
- 
- export SOURCE=$1
-export SHADOW=$2
-+export PRODUCTION=$2
- 
-echo "Updating shadow database"
-impala-shell -d ${SOURCE} -q "invalidate metadata"
-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f -
-impala-shell -q "create database if not exists ${SHADOW}"
-impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f -
-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
-echo "Shadow db ready!"
-\ No newline at end of file
-+echo "Updating ${PRODUCTION} database"
-+impala-shell -q "create database if not exists ${PRODUCTION}"
-+impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
-+impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
-+echo "Production db ready!"
-\ No newline at end of file
-diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-index d744f18d..0d8ff7ee 100644
--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-+++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
-@@ -6,7 +6,15 @@
-         </property>
-         <property>
-             <name>stats_db_production_name</name>
-            <description>the name of the production schema</description>
-+            <description>the name of the public production schema</description>
-+        </property>
-+        <property>
-+            <name>monitor_db_name</name>
-+            <description>the monitor database name</description>
-+        </property>
-+        <property>
-+            <name>monitor_db_production_name</name>
-+            <description>the name of the monitor public database</description>
-         </property>
-         <property>
-             <name>stats_tool_api_url</name>
-@@ -48,25 +56,26 @@
-     </kill>
- 
-     <action name="updateProductionViews">
-        <hive2 xmlns="uri:oozie:hive2-action:0.1">
-            <jdbc-url>${hive_jdbc_url}</jdbc-url>
-            <script>scripts/updateProductionViews.sql</script>
-			<param>stats_db_name=${stats_db_name}</param>
-			<param>stats_db_production_name=${stats_db_production_name}</param>
-        </hive2>
-        <ok to="computeProductionStats"/>
-+        <shell xmlns="uri:oozie:shell-action:0.1">
-+            <job-tracker>${jobTracker}</job-tracker>
-+            <name-node>${nameNode}</name-node>
-+            <exec>updateProductionViews.sh</exec>
-+            <argument>${stats_db_name}</argument>
-+            <argument>${stats_db_production_name}</argument>
-+            <file>updateProductionViews.sh</file>
-+        </shell>
-+        <ok to="updateMonitorViews"/>
-         <error to="Kill"/>
-     </action>
- 
-    <action name="computeProductionStats">
-+    <action name="updateMonitorViews">
-         <shell xmlns="uri:oozie:shell-action:0.1">
-             <job-tracker>${jobTracker}</job-tracker>
-             <name-node>${nameNode}</name-node>
-            <exec>impala-shell.sh</exec>
-            <argument>${stats_db_production_name}</argument>
-            <argument>computeProductionStats.sql</argument>
-            <argument>${wf:appPath()}/scripts/computeProductionStats.sql</argument>
-            <file>impala-shell.sh</file>
-+            <exec>updateProductionViews.sh</exec>
-+            <argument>${monitor_db_name}</argument>
-+            <argument>${monitor_db_production_name}</argument>
-+            <file>updateProductionViews.sh</file>
-         </shell>
-         <ok to="promoteCache"/>
-         <error to="Kill"/>
-- 
-2.17.1
-
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java
@ -42,6 +42,8 @@ public class Constants {
 	public static final String RETRY_DELAY = "retryDelay";
 	public static final String CONNECT_TIMEOUT = "connectTimeOut";
 	public static final String READ_TIMEOUT = "readTimeOut";
+	public static final String FROM_DATE_OVERRIDE = "fromDateOverride";
+	public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride";

 	public static final String CONTENT_TOTALITEMS = "TotalItems";
 	public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf;
 import java.util.*;
 import java.util.function.Function;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;

 import org.apache.commons.lang3.StringUtils;

@ -22,6 +23,9 @@ public class CleaningFunctions {
 	public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";

 	public static final Set<String> PID_BLACKLIST = new HashSet<>();
+	public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
+	public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
+	public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;

 	static {
 		PID_BLACKLIST.add("none");
@ -80,6 +84,36 @@ public class CleaningFunctions {
 		return value;
 	}

+	public static <T extends Oaf> boolean filter(T value) {
+		if (value instanceof Datasource) {
+			// nothing to evaluate here
+		} else if (value instanceof Project) {
+			// nothing to evaluate here
+		} else if (value instanceof Organization) {
+			// nothing to evaluate here
+		} else if (value instanceof Relation) {
+			// nothing to clean here
+		} else if (value instanceof Result) {
+
+			Result r = (Result) value;
+
+			if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
+				return false;
+			}
+
+			if (value instanceof Publication) {
+
+			} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
+
+			} else if (value instanceof OtherResearchProduct) {
+
+			} else if (value instanceof Software) {
+
+			}
+		}
+		return true;
+	}
+
 	public static <T extends Oaf> T cleanup(T value) {
 		if (value instanceof Datasource) {
 			// nothing to clean here
@ -124,6 +158,12 @@ public class CleaningFunctions {
 							.stream()
 							.filter(Objects::nonNull)
 							.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
+							.filter(
+								sp -> sp
+									.getValue()
+									.toLowerCase()
+									.replaceAll(TITLE_FILTER_REGEX, "")
+									.length() > TITLE_FILTER_RESIDUAL_LENGTH)
 							.map(CleaningFunctions::cleanValue)
 							.collect(Collectors.toList()));
 			}
@ -199,16 +239,7 @@ public class CleaningFunctions {
 				}
 			}
 			if (Objects.nonNull(r.getAuthor())) {
-				boolean nullRank = r
-					.getAuthor()
-					.stream()
-					.anyMatch(a -> Objects.isNull(a.getRank()));
-				if (nullRank) {
-					int i = 1;
-					for (Author author : r.getAuthor()) {
-						author.setRank(i++);
-					}
-				}
+				final List<Author> authors = Lists.newArrayList();
 				for (Author a : r.getAuthor()) {
 					if (Objects.isNull(a.getPid())) {
 						a.setPid(Lists.newArrayList());
@ -235,7 +266,26 @@ public class CleaningFunctions {
 									.stream()
 									.collect(Collectors.toList()));
 					}
+					if (StringUtils.isBlank(a.getFullname())) {
+						if (StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname())) {
+							a.setFullname(a.getSurname() + ", " + a.getName());
 						}
+					}
+					if (StringUtils.isNotBlank(a.getFullname()) && isValidAuthorName(a)) {
+						authors.add(a);
+					}
+				}
+
+				boolean nullRank = authors
+					.stream()
+					.anyMatch(a -> Objects.isNull(a.getRank()));
+				if (nullRank) {
+					int i = 1;
+					for (Author author : authors) {
+						author.setRank(i++);
+					}
+				}
+				r.setAuthor(authors);

 			}
 			if (value instanceof Publication) {
@ -252,6 +302,15 @@ public class CleaningFunctions {
 		return value;
 	}

+	private static boolean isValidAuthorName(Author a) {
+		return !Stream
+			.of(a.getFullname(), a.getName(), a.getSurname())
+			.filter(s -> s != null && !s.isEmpty())
+			.collect(Collectors.joining(""))
+			.toLowerCase()
+			.matches(INVALID_AUTHOR_REGEX);
+	}
+
 	private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
 		return pids
 			.stream()
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java
@ -0,0 +1,206 @@
+
+package eu.dnetlib.dhp.transformation.xslt;
+
+import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
+
+import java.io.Serializable;
+// import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.text.Normalizer;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.hash.Hashing;
+
+import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize;
+import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations;
+import net.sf.saxon.s9api.ExtensionFunction;
+import net.sf.saxon.s9api.ItemType;
+import net.sf.saxon.s9api.OccurrenceIndicator;
+import net.sf.saxon.s9api.QName;
+import net.sf.saxon.s9api.SaxonApiException;
+import net.sf.saxon.s9api.SequenceType;
+import net.sf.saxon.s9api.XdmValue;
+
+//import eu.dnetlib.pace.clustering.NGramUtils;
+//import eu.dnetlib.pace.util.Capitalise;
+//import eu.dnetlib.pace.util.DotAbbreviations;
+
+public class PersonCleaner implements ExtensionFunction, Serializable {
+	/**
+	 *
+	 */
+	private static final long serialVersionUID = 1L;
+	private List<String> firstname = Lists.newArrayList();
+	private List<String> surname = Lists.newArrayList();
+	private List<String> fullname = Lists.newArrayList();
+
+	private static Set<String> particles = null;
+
+	public PersonCleaner() {
+
+	}
+
+	public String normalize(String s) {
+		s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
+		s = s.replaceAll("\\(.+\\)", "");
+		s = s.replaceAll("\\[.+\\]", "");
+		s = s.replaceAll("\\{.+\\}", "");
+		s = s.replaceAll("\\s+-\\s+", "-");
+
+//              s = s.replaceAll("[\\W&&[^,-]]", " ");
+
+//              System.out.println("class Person: s: " + s);
+
+//              s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", " ");
+		s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
+		s = s.replace("\\d", " ");
+		s = s.replace("\\n", " ");
+		s = s.replace("\\.", " ");
+		s = s.replaceAll("\\s+", " ");
+
+		if (s.contains(",")) {
+			// System.out.println("class Person: s: " + s);
+
+			String[] arr = s.split(",");
+			if (arr.length == 1) {
+
+				fullname = splitTerms(arr[0]);
+			} else if (arr.length > 1) {
+				surname = splitTerms(arr[0]);
+				firstname = splitTermsFirstName(arr[1]);
+//                              System.out.println("class Person: surname: " + surname);
+//                              System.out.println("class Person: firstname: " + firstname);
+
+				fullname.addAll(surname);
+				fullname.addAll(firstname);
+			}
+		} else {
+			fullname = splitTerms(s);
+
+			int lastInitialPosition = fullname.size();
+			boolean hasSurnameInUpperCase = false;
+
+			for (int i = 0; i < fullname.size(); i++) {
+				String term = fullname.get(i);
+				if (term.length() == 1) {
+					lastInitialPosition = i;
+				} else if (term.equals(term.toUpperCase())) {
+					hasSurnameInUpperCase = true;
+				}
+			}
+			if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
+				firstname = fullname.subList(0, lastInitialPosition + 1);
+				System.out.println("name: " + firstname);
+				surname = fullname.subList(lastInitialPosition + 1, fullname.size());
+			} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
+				for (String term : fullname) {
+					if (term.length() > 1 && term.equals(term.toUpperCase())) {
+						surname.add(term);
+					} else {
+						firstname.add(term);
+					}
+				}
+			} else if (lastInitialPosition == fullname.size()) {
+				surname = fullname.subList(lastInitialPosition - 1, fullname.size());
+				firstname = fullname.subList(0, lastInitialPosition - 1);
+			}
+
+		}
+		return null;
+	}
+
+	private List<String> splitTermsFirstName(String s) {
+		List<String> list = Lists.newArrayList();
+		for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
+			if (s.trim().matches("\\p{Lu}{2,3}")) {
+				String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase)
+				for (String p : parts) {
+					if (p.length() > 0)
+						list.add(p);
+				}
+			} else {
+				list.add(part);
+			}
+
+		}
+		return list;
+	}
+
+	private List<String> splitTerms(String s) {
+		if (particles == null) {
+			// particles = NGramUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
+		}
+
+		List<String> list = Lists.newArrayList();
+		for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
+			// if (!particles.contains(part.toLowerCase())) {
+			list.add(part);
+
+			// }
+		}
+		return list;
+	}
+
+	public List<String> getFirstname() {
+		return firstname;
+	}
+
+	public List<String> getSurname() {
+		return surname;
+	}
+
+	public List<String> getFullname() {
+		return fullname;
+	}
+
+	public String hash() {
+		return Hashing.murmur3_128().hashString(getNormalisedFullname(), StandardCharsets.UTF_8).toString();
+	}
+
+	public String getNormalisedFullname() {
+		return isAccurate() ? Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations())
+			: Joiner.on(" ").join(fullname);
+		// return isAccurate() ?
+		// Joiner.on(" ").join(getCapitalSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) :
+		// Joiner.on(" ").join(fullname);
+	}
+
+	public List<String> getCapitalSurname() {
+		return Lists.newArrayList(Iterables.transform(surname, new Capitalize()));
+	}
+
+	public List<String> getNameWithAbbreviations() {
+		return Lists.newArrayList(Iterables.transform(firstname, new DotAbbreviations()));
+	}
+
+	public boolean isAccurate() {
+		return (firstname != null && surname != null && !firstname.isEmpty() && !surname.isEmpty());
+	}
+
+	@Override
+	public QName getName() {
+		return new QName(QNAME_BASE_URI + "/person", "person");
+	}
+
+	@Override
+	public SequenceType getResultType() {
+		return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
+	}
+
+	@Override
+	public SequenceType[] getArgumentTypes() {
+		// TODO Auto-generated method stub
+		return null;
+	}
+
+	@Override
+	public XdmValue call(XdmValue[] arguments) throws SaxonApiException {
+		// TODO Auto-generated method stub
+		return null;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
@ -46,6 +46,7 @@ public class XSLTTransformationFunction implements MapFunction<MetadataRecord, M
 			Processor processor = new Processor(false);
 			processor.registerExtensionFunction(cleanFunction);
 			processor.registerExtensionFunction(new DateCleaner());
+			processor.registerExtensionFunction(new PersonCleaner());

 			final XsltCompiler comp = processor.newXsltCompiler();
 			XsltExecutable xslt = comp
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java
@ -0,0 +1,14 @@
+
+package eu.dnetlib.dhp.transformation.xslt.utils;
+
+// import org.apache.commons.text.WordUtils;
+// import org.apache.commons.text.WordUtils;
+import com.google.common.base.Function;
+
+public class Capitalize implements Function<String, String> {
+
+	@Override
+	public String apply(String s) {
+		return org.apache.commons.lang3.text.WordUtils.capitalize(s.toLowerCase());
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java
@ -0,0 +1,12 @@
+
+package eu.dnetlib.dhp.transformation.xslt.utils;
+
+import com.google.common.base.Function;
+
+public class DotAbbreviations implements Function<String, String> {
+
+	@Override
+	public String apply(String s) {
+		return s.length() == 1 ? s + "." : s;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
@ -92,15 +92,18 @@ public class TransformationJobTest extends AbstractVocabularyTest {
 	}

 	@Test
-	@DisplayName("Test Transform Inst.&Them.v4 record XML with xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid")
-	public void testTransformITGv4() throws Exception {
+	@DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite/oaiOpenAIRE")
+	public void testTransformMostlyUsedScript() throws Exception {
+
+		String xslTransformationScript = "";
+		xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl";
+		xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl";

 		// We Set the input Record getting the XML from the classpath
 		final MetadataRecord mr = new MetadataRecord();
 		mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml")));
 		// We Load the XSLT transformation Rule from the classpath
-		XSLTTransformationFunction tr = loadTransformationRule(
-			"/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl");
+		XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript);

 		MetadataRecord result = tr.call(mr);

@ -110,15 +113,17 @@ public class TransformationJobTest extends AbstractVocabularyTest {
 	}

 	@Test
-	@DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite")
-	public void testTransformMostlyUsedScript() throws Exception {
+	@DisplayName("Test Transform record XML with xslt_cleaning_REST_OmicsDI")
+	public void testTransformRestScript() throws Exception {
+
+		String xslTransformationScript = "";
+		xslTransformationScript = "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl";

 		// We Set the input Record getting the XML from the classpath
 		final MetadataRecord mr = new MetadataRecord();
-		mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml")));
+		mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_omicsdi.xml")));
 		// We Load the XSLT transformation Rule from the classpath
-		XSLTTransformationFunction tr = loadTransformationRule(
-			"/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl");
+		XSLTTransformationFunction tr = loadTransformationRule(xslTransformationScript);

 		MetadataRecord result = tr.call(mr);

--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_omicsdi.xml
@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<oai:record xmlns="http://namespace.openaire.eu/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:dri="http://www.driver-repository.eu/namespace/dri"
+  xmlns:oaf="http://namespace.openaire.eu/oaf"
+  xmlns:oai="http://www.openarchives.org/OAI/2.0/"
+  xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <oai:header>
+    <dri:objIdentifier>_____OmicsDI::0000337c02d1b51030675d69407655da</dri:objIdentifier>
+    <dri:recordIdentifier>PRJNA78295</dri:recordIdentifier>
+    <dri:dateOfCollection>2020-10-31T15:31:30.725Z</dri:dateOfCollection>
+    <oaf:datasourceprefix>_____OmicsDI</oaf:datasourceprefix>
+  </oai:header>
+  <oai:metadata>
+    <datasets xmlns="">
+      <connectionsCountScaled>0.235294117647059</connectionsCountScaled>
+      <reanalysisCount>0</reanalysisCount>
+      <keywords>null</keywords>
+      <citationsCountScaled>0.0</citationsCountScaled>
+      <viewsCount>0</viewsCount>
+      <description>Sedimentitalea nanhaiensis DSM 24252 Genome sequencing and assembly</description>
+      <downloadCountScaled>8.20101314054644E-5</downloadCountScaled>
+      <source>omics_ena_project</source>
+      <title>Sedimentitalea nanhaiensis DSM 24252</title>
+      <connectionsCount>14</connectionsCount>
+      <citationsCount>0</citationsCount>
+      <score>null</score>
+      <omicsType>Genomics</omicsType>
+      <reanalysisCountScaled>0.0</reanalysisCountScaled>
+      <organisms>
+        <acc>571166</acc>
+        <name>Sedimentitalea nanhaiensis DSM 24252</name>
+      </organisms>
+      <viewsCountScaled>0.0</viewsCountScaled>
+      <claimable>false</claimable>
+      <id>PRJNA78295</id>
+      <publicationDate>null</publicationDate>
+      <downloadCount>13</downloadCount>
+    </datasets>
+  </oai:metadata>
+  <about xmlns="">
+    <provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
+      <originDescription altered="true" harvestDate="2020-10-31T15:31:30.725Z">
+        <baseURL>https%3A%2F%2Fwww.omicsdi.org%2Fws%2Fdataset%2Fsearch</baseURL>
+        <identifier/>
+        <datestamp/>
+        <metadataNamespace/>
+      </originDescription>
+    </provenance>
+    <oaf:datainfo>
+      <oaf:inferred>false</oaf:inferred>
+      <oaf:deletedbyinference>false</oaf:deletedbyinference>
+      <oaf:trust>0.9</oaf:trust>
+      <oaf:inferenceprovenance/>
+      <oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
+        classname="sysimport:crosswalk:datasetarchive"
+        schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
+    </oaf:datainfo>
+  </about>
+</oai:record>
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_REST_OmicsDI.xsl
@ -0,0 +1,297 @@
+<!-- complete literature v4: xslt_cleaning_REST_OmicsDI ; transformation script production , 2021-03-17 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+                version="2.0"
+                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                xmlns:oaf="http://namespace.openaire.eu/oaf"
+                xmlns:dr="http://www.driver-repository.eu/namespace/dr"
+                xmlns:datacite="http://datacite.org/schema/kernel-4"
+                xmlns:dc="http://purl.org/dc/elements/1.1/"
+                xmlns:dri="http://www.driver-repository.eu/namespace/dri"
+                xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
+                xmlns:xs="http://www.w3.org/2001/XMLSchema"
+                xmlns:vocabulary="http://eu/dnetlib/transform/clean"
+                xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO"
+                xmlns:personCleaner="http://eu/dnetlib/transform/person"
+                exclude-result-prefixes="xsl vocabulary dateCleaner personCleaner">
+
+  <xsl:param name="varOfficialName" />
+  <xsl:param name="varDsType" />
+  <xsl:param name="varDataSourceId" />
+  <xsl:param name="index" select="0"/>
+  <xsl:param name="transDate" select="current-dateTime()"/>
+
+<xsl:variable name="vCodes">
+    <codes>
+         <code source="arrayexpress-repository" id="re3data_____::r3d100010222"  name="ArrayExpress Archive of Functional Genomics Data"  sourceUrl="https://www.ebi.ac.uk/arrayexpress/" urlTemplate="https://www.ebi.ac.uk/arrayexpress/experiments/" />
+         <code source="atlas-experiments"       id="re3data_____::r3d100010223"  name="Expression Atlas Database"                         sourceUrl="http://www.ebi.ac.uk/gxa/home" urlTemplate="" />
+         <code source="biomodels"               id="re3data_____::r3d100010789"  name="BioModels Database"                                sourceUrl="https://www.ebi.ac.uk/biomodels-main/" urlTemplate="" />
+         <code source="dbgap"                   id="re3data_____::r3d100010788"  name="dbGaP (database of Genotypes and Phenotypes)"      sourceUrl="https://www.ncbi.nlm.nih.gov/gap" urlTemplate="" />
+         <code source="ega"                     id="re3data_____::r3d100011242"  name="EGA Database (European Genome-phenome Archive)"    sourceUrl="https://ega-archive.org" urlTemplate="" />
+         <code source="eva"                     id="re3data_____::r3d100011553"  name="EVA database (European Variation Archive)"         sourceUrl="https://www.ebi.ac.uk/eva/" urlTemplate="" />
+         <code source="geo"                     id="re3data_____::r3d100010283"  name="GEO (Gene Expression Omnibus)"                     sourceUrl="https://www.ncbi.nlm.nih.gov/geo/" urlTemplate="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="  />
+         <code source="gnps"                    id="omicsdi_____::gnps"          name="GNPS Database (Global Natural Products Social Molecular Networking)" sourceUrl="https://gnps.ucsd.edu/ProteoSAFe/static/gnps-splash2.jsp" urlTemplate="" />
+         <code source="gpmdb"                   id="re3data_____::r3d100010883"  name="GPMDB (Global Proteome Machine)"                   sourceUrl="http://gpmdb.thegpm.org/" urlTemplate="http://gpmdb.thegpm.org/~/dblist_gpmnum/gpmnum=" />
+         <code source="jpost"                   id="re3data_____::r3d100012349"  name="JPOST Repository (Japan ProteOme STandard Repository/Database)" sourceUrl="https://jpostdb.org/" urlTemplate="https://repository.jpostdb.org/entry/JPST000228" />
+         <code source="lincs"                   id="re3data_____::r3d100011833"  name="LINCS (Big Data to Knowledge / Library of Integrated Network-based Cellular Signatures)" sourceUrl="http://lincsportal.ccs.miami.edu/dcic-portal/" urlTemplate="http://lincsportal.ccs.miami.edu/datasets/#/view/" />
+         <code source="massive"                 id="omicsdi_____::massive"       name="MassIVE Database (Mass Spectrometry Interactive Virtual Environment)" sourceUrl="https://massive.ucsd.edu/ProteoSAFe/datasets.jsp" urlTemplate="" />
+         <code source="metabolights_dataset"    id="opendoar____::2970"          name="MetaboLights Database"                             sourceUrl="http://www.ebi.ac.uk/metabolights/" urlTemplate="" />
+         <code source="metabolome_express"      id="omicsdi_____::metabolome"    name="MetabolomeExpress"                                 sourceUrl="https://www.metabolome-express.org/" urlTemplate="https://www.metabolome-express.org/datasetview.php?datasetid=" />
+         <code source="metabolomics_workbench"  id="re3data_____::r3d100012314"  name="Metabolomics Workbench Database"                   sourceUrl="http://www.metabolomicsworkbench.org/" urlTemplate="http://www.metabolomicsworkbench.org/data/DRCCMetadata.php?StudyID=" />
+         <code source="NCBI"                    id="omicsdi_____::ncbi"          name="NCBI"                                              sourceUrl="https://www.ncbi.nlm.nih.gov/bioproject/" urlTemplate="https://www.ncbi.nlm.nih.gov/bioproject/" />
+         <code source="omics_ena_project"       id="re3data_____::r3d100010527"  name="ENA (European Nucleotide Archive)"                 sourceUrl="https://www.ebi.ac.uk/ena" urlTemplate="https://www.ebi.ac.uk/ena/data/view/" />
+         <code source="paxdb"                   id="omicsdi_____::paxdb"         name="PAXDB (protein abundance database)"                sourceUrl="http://pax-db.org/" urlTemplate="" />
+         <code source="peptide_atlas"           id="re3data_____::r3d100010889"  name="PeptideAtlas Database"                             sourceUrl="http://www.peptideatlas.org/" urlTemplate="" />
+         <code source="pride"                   id="re3data_____::r3d100010137"  name="PRIDE Database (PRoteomics IDEntifications)"       sourceUrl="http://www.ebi.ac.uk/pride/archive/" urlTemplate="http://www.ebi.ac.uk/pride/archive/projects/PXD008134" />
+    </codes>
+ </xsl:variable>
+
+<!--
+gnps, jpost, massive, metabolome_express, paxdb: no id/OpenAIRE-entry found
+ncbi: several OpenAIRE-entries found - is one the right?
+-->
+
+<xsl:key name="kCodeByName" match="code" use="string(@source)"/>
+
+  <xsl:template match="/">
+    <xsl:variable name="datasourcePrefix"
+             select="normalize-space(//oaf:datasourceprefix)" />
+    <xsl:call-template name="validRecord" />
+  </xsl:template>
+
+ <xsl:template name="terminate">
+  	<xsl:message terminate="yes">
+  	record is not compliant, transformation is interrupted.
+  	</xsl:message>
+  </xsl:template>
+
+  <xsl:template name="validRecord">
+    <record>
+     <xsl:apply-templates select="//*[local-name() = 'header']" />
+
+      <metadata>
+
+<!--
+  <xsl:apply-templates select="//*[local-name() = 'metadata']//*[local-name() = 'datasets']"/>
+-->
+
+<datacite:resource>
+
+<!-- OmicsDI does not state: languages, projects, 
+-->
+
+<!-- landing page -->
+<xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
+        <datacite:alternateIdentifiers>
+        <datacite:alternateIdentifier>
+                  <xsl:attribute name="alternateIdentifierType">
+                            <xsl:value-of select="'LandingPage'"/>
+                  </xsl:attribute>
+                 <xsl:choose>
+                            <xsl:when test="//*[local-name() = 'source'][. = ('gnps','massive','paxdb','peptide_atlas')]">
+                                      <xsl:value-of select="concat('https://www.omicsdi.org/#/dataset/', //*[local-name() = 'source'], '/', //*[local-name() = 'id'])"/>
+                            </xsl:when>
+                            <xsl:when test="//*[local-name() = 'source'][. = 'metabolome_express']">
+                                      <xsl:value-of select="concat(key('kCodeByName', string(//*[local-name()='source']), $vCodes)/@urlTemplate, substring-after(//*[local-name()='id'], 'MEX'))"/>
+                            </xsl:when>
+                            <xsl:otherwise>
+                                     <xsl:value-of select="concat(key('kCodeByName', string(//*[local-name()='source']), $vCodes)/@urlTemplate, //*[local-name()='id'])"/>
+                            </xsl:otherwise>
+                 </xsl:choose>
+        </datacite:alternateIdentifier>
+        <datacite:alternateIdentifier>
+                  <xsl:attribute name="alternateIdentifierType">
+                            <xsl:value-of select="'local'"/>
+                  </xsl:attribute>
+                  <xsl:value-of select="//*[local-name()='id']"/>
+        </datacite:alternateIdentifier>
+        </datacite:alternateIdentifiers>
+</xsl:if>
+
+<!-- identifier; ... -->
+<!-- URL hindered by _et_: https://www.omicsdi.org/ws/dataset/get_acc=E-MTAB-6546_et_database=arrayexpress-repository -->
+<xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
+        <datacite:identifier>
+                  <xsl:attribute name="identifierType">
+                            <xsl:value-of select="'URL'"/>
+                  </xsl:attribute>
+
+                  <xsl:value-of select="concat('https://www.omicsdi.org/dataset/', //*[local-name() = 'source'], '/', //*[local-name() = 'id'])"/>
+
+        </datacite:identifier>
+</xsl:if>
+
+<!-- title -->
+<xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
+<datacite:titles>
+        <datacite:title>
+                  <xsl:value-of select="//*[local-name() = 'title']"/>
+        </datacite:title>
+</datacite:titles>
+</xsl:if>
+
+<!-- no authors in OmicsDI -->
+<!--
+<xsl:call-template name="authors" />
+-->
+
+<!--
+<xsl:call-template name="relatedPaper" />
+-->
+
+<datacite:descriptions>
+        <datacite:description>
+                  <xsl:attribute name="descriptionType">
+                            <xsl:value-of select="'Abstract'"/>
+                  </xsl:attribute>
+                  <xsl:value-of select="//*[local-name() = 'description']"/>
+        </datacite:description>
+</datacite:descriptions>
+
+<!-- subject -->
+<datacite:subjects>
+<xsl:for-each select="distinct-values(//*[local-name()='omicsType'])">
+<datacite:subject>
+<xsl:value-of select="."/>
+</datacite:subject>
+</xsl:for-each>
+</datacite:subjects>
+
+</datacite:resource>
+
+                <xsl:choose>
+
+           <xsl:when test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
+                      <xsl:variable name='varCobjCategory'
+                                  select="'0021'" />
+             <dr:CobjCategory>
+             <xsl:attribute name="type">
+                        <xsl:value-of select="vocabulary:clean($varCobjCategory, 'dnet:result_typologies')"/>
+             </xsl:attribute>
+               <xsl:value-of 
+                     select="$varCobjCategory" />
+             </dr:CobjCategory>
+           </xsl:when>
+
+           <xsl:otherwise>
+<!--
+               <xsl:call-template name="terminate"/>
+-->
+           </xsl:otherwise>
+         </xsl:choose>
+<!--
+     // review status: no review indications found so far
+-->
+<!--
+OMICSDI is ‘including both open and controlled data source’.
+-->
+              <oaf:accessrights>
+                 <xsl:text>UNKNOWN</xsl:text>
+              </oaf:accessrights>
+
+<xsl:if test="//*[local-name() = 'datasourceprefix'][.='NeuroVault__']">
+        <oaf:concept>
+                  <xsl:attribute name="id">
+                            <xsl:value-of select="'ni'"/>
+                  </xsl:attribute>
+        </oaf:concept>
+</xsl:if>
+
+         <oaf:hostedBy>
+            <xsl:attribute name="name">
+               <xsl:value-of select="key('kCodeByName', string(//*[local-name()='source']), $vCodes)/@name"/>
+            </xsl:attribute>
+            <xsl:attribute name="id">
+               <xsl:value-of select="key('kCodeByName', string(//*[local-name()='source']), $vCodes)/@id"/>
+            </xsl:attribute>
+         </oaf:hostedBy>
+         <oaf:collectedFrom>
+            <xsl:attribute name="name">
+               <xsl:value-of select="$varOfficialName"/>
+            </xsl:attribute>
+            <xsl:attribute name="id">
+               <xsl:value-of select="$varDataSourceId"/>
+            </xsl:attribute>
+         </oaf:collectedFrom>
+
+<!-- date -->
+<xsl:if test="//*[local-name() = 'datasourceprefix'][.='_____OmicsDI']">
+        <oaf:dateAccepted>
+                  <xsl:value-of select="replace(//*[local-name() = 'publicationDate'][not(.='null')],'(\d{4})(\d{2})(\d{2})','$1-$2-$3')"/>
+        </oaf:dateAccepted>
+</xsl:if>
+
+      </metadata>
+      <xsl:copy-of select="//*[local-name() = 'about']" />
+   </record>
+
+</xsl:template>
+
+<xsl:template match="node()|@*">
+     <xsl:copy>
+       <xsl:apply-templates select="node()|@*"/>
+     </xsl:copy>
+ </xsl:template>
+
+  <xsl:template match="//*[local-name() = 'metadata']//*[local-name() = 'datasets']">
+    <xsl:copy>
+       <xsl:apply-templates select="node()|@*"/>
+     </xsl:copy>
+  </xsl:template>
+
+                       <xsl:template match="//*[local-name() = 'header']">
+<xsl:copy>
+     <xsl:apply-templates  select="node()|@*"/>
+                             <xsl:element name="dr:dateOfTransformation">
+                                 <xsl:value-of select="$transDate"/>
+                             </xsl:element>
+  </xsl:copy>
+                        </xsl:template>
+<!-- 
+no authors findable in OmicsDI">
+-->
+<!-- 
+  <xsl:template match="//*[local-name() = 'authors']">
+-->
+  <xsl:template name="authors">
+<xsl:choose>
+<xsl:when test="not(//*[local-name() = 'authors'][string-length(normalize-space(.)) > 0 and not(. = 'null')])">
+             <xsl:call-template name="terminate" />
+</xsl:when>
+<xsl:otherwise>
+<xsl:for-each select="tokenize(//*[local-name() = 'authors'], '(, and |,| and )')">
+
+    <xsl:element name="datacite:creator">
+
+        <xsl:element name="datacite:creatorName">
+           <xsl:value-of select="personCleaner:normalize( .)"/>
+        </xsl:element>
+
+        <xsl:element name="datacite:givenName">
+           <xsl:value-of select="normalize-space(substring-after(personCleaner:normalize(.), ','))"/>
+        </xsl:element>
+<xsl:element name="datacite:familyName">
+           <xsl:value-of select="substring-before(personCleaner:normalize(.), ',')"/>
+        </xsl:element>
+
+    </xsl:element>
+</xsl:for-each>
+</xsl:otherwise>
+</xsl:choose>
+</xsl:template>
+
+<!--
+  <xsl:template match="//*[local-name() = 'DOI']">
+-->
+<xsl:template name="relatedPaper">
+    <xsl:element name="datacite:relatedIdentifier">
+ <xsl:attribute name="relatedIdentifierType">
+           <xsl:value-of select="'DOI'"/>
+        </xsl:attribute>
+<xsl:attribute name="relationType">
+           <xsl:value-of select="'isReferencedBy'"/>
+        </xsl:attribute>
+<xsl:value-of select="//*[local-name() = 'DOI']"/>
+</xsl:element>
+</xsl:template>
+
+</xsl:stylesheet>
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala
@ -4,16 +4,23 @@ import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.common.ModelConstants
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
 import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
-import eu.dnetlib.dhp.schema.orcid.OrcidDOI
+import eu.dnetlib.dhp.schema.orcid.{AuthorData, OrcidDOI}
 import eu.dnetlib.doiboost.DoiBoostMappingUtil
 import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
 import org.apache.commons.lang.StringUtils
 import org.slf4j.{Logger, LoggerFactory}

 import scala.collection.JavaConverters._
+import org.json4s
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST._
+import org.json4s.jackson.JsonMethods._


-case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,errorCode:String){}
+case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){}
+case class OrcidAuthor(oid:String, name:Option[String], surname:Option[String], creditName:Option[String], otherNames:Option[List[String]], errorCode:Option[String]){}
+case class OrcidWork(oid:String, doi:String)
+



@ -46,8 +53,52 @@ object ORCIDToOAF {
  }


-  def convertTOOAF(input:OrcidDOI) :Publication = {
-    val doi = input.getDoi
+  def strValid(s:Option[String]) : Boolean = {
+    s.isDefined && s.get.nonEmpty
+  }
+
+  def authorValid(author:OrcidAuthor): Boolean ={
+    if (strValid(author.name) && strValid(author.surname)) {
+      return true
+    }
+    if (strValid(author.surname)) {
+      return  true
+    }
+    if (strValid(author.creditName)) {
+      return true
+
+    }
+    false
+  }
+
+
+  def extractDOIWorks(input:String): List[OrcidWork] = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: json4s.JValue = parse(input)
+
+    val oid = (json \ "workDetail" \"oid").extract[String]
+    val doi:List[(String, String)] = for {
+      JObject(extIds) <-  json \ "workDetail" \"extIds"
+      JField("type", JString(typeValue)) <- extIds
+      JField("value", JString(value)) <- extIds
+      if "doi".equalsIgnoreCase(typeValue)
+    } yield (typeValue, value)
+    if (doi.nonEmpty) {
+      return doi.map(l =>OrcidWork(oid, l._2))
+    }
+    List()
+  }
+
+  def convertORCIDAuthor(input:String): OrcidAuthor = {
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: json4s.JValue = parse(input)
+
+    (json \"authorData" ).extractOrElse[OrcidAuthor](null)
+   }
+
+
+  def convertTOOAF(input:ORCIDItem) :Publication = {
+    val doi = input.doi
    val pub:Publication = new Publication
    pub.setPid(List(createSP(doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
    pub.setDataInfo(generateDataInfo())
@ -58,8 +109,8 @@ object ORCIDToOAF {

    try{

-      val l:List[Author]= input.getAuthors.asScala.map(a=> {
-        generateAuthor(a.getName, a.getSurname, a.getCreditName, a.getOid)
+      val l:List[Author]= input.authors.map(a=> {
+        generateAuthor(a)
      })(collection.breakOut)

      pub.setAuthor(l.asJava)
@ -80,16 +131,20 @@ object ORCIDToOAF {
    di
  }

-  def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
+  def generateAuthor(o : OrcidAuthor): Author = {
    val a = new Author
-    a.setName(given)
-    a.setSurname(family)
-    if (fullName!= null && fullName.nonEmpty)
-      a.setFullname(fullName)
-    else
-      a.setFullname(s"$given $family")
-    if (StringUtils.isNotBlank(orcid))
-      a.setPid(List(createSP(orcid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava)
+    if (strValid(o.name)) {
+    a.setName(o.name.get.capitalize)
+    }
+    if (strValid(o.surname)) {
+      a.setSurname(o.surname.get.capitalize)
+    }
+    if(strValid(o.name) && strValid(o.surname))
+      a.setFullname(s"${o.name.get.capitalize} ${o.surname.get.capitalize}")
+    else if (strValid(o.creditName))
+      a.setFullname(o.creditName.get)
+    if (StringUtils.isNotBlank(o.oid))
+      a.setPid(List(createSP(o.oid, ModelConstants.ORCID, ModelConstants.DNET_PID_TYPES, generateOricPIDDatainfo())).asJava)

    a
  }
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala
@ -5,68 +5,48 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.oa.merge.AuthorMerger
 import eu.dnetlib.dhp.schema.oaf.Publication
 import eu.dnetlib.dhp.schema.orcid.OrcidDOI
-import eu.dnetlib.doiboost.mag.ConversionUtil
 import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.expressions.Aggregator
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}

 object SparkConvertORCIDToOAF {
  val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)

-  def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
-
-    override def zero: Publication = new Publication()
-
-    override def reduce(b: Publication, a: (String, Publication)): Publication = {
-      b.mergeFrom(a._2)
-      b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
-      if (b.getId == null)
-        b.setId(a._2.getId)
-      b
-    }
-
-
-    override def merge(wx: Publication, wy: Publication): Publication = {
-      wx.mergeFrom(wy)
-      wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
-      if(wx.getId == null && wy.getId.nonEmpty)
-        wx.setId(wy.getId)
-      wx
-    }
-    override def finish(reduction: Publication): Publication = reduction
-
-    override def bufferEncoder: Encoder[Publication] =
-      Encoders.kryo(classOf[Publication])
-
-    override def outputEncoder: Encoder[Publication] =
-      Encoders.kryo(classOf[Publication])
-  }
-
-  def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
+  def run(spark:SparkSession,sourcePath:String,workingPath:String, targetPath:String):Unit = {
+    import spark.implicits._
    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
-    implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
-    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)

-    val mapper = new ObjectMapper()
-    mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
+    val inputRDD:RDD[OrcidAuthor]  = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s!= null).filter(s => ORCIDToOAF.authorValid(s))

-    val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI])))
+    spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author")
+
+    val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s!= null)
+
+    spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works")
+
+    val authors :Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
+
+    val works :Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
+
+    works.joinWith(authors, authors("oid").equalTo(works("oid")))
+      .map(i =>{
+      val doi = i._1.doi
+      val author = i._2
+      (doi, author)
+    }).groupBy(col("_1").alias("doi"))
+      .agg(collect_list(col("_2")).alias("authors"))
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor")
+
+    val dataset: Dataset[ORCIDItem] =spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]

    logger.info("Converting ORCID to OAF")
-    dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
-      .map(d => (d.getId, d))
-      .groupByKey(_._1)(Encoders.STRING)
-      .agg(getPublicationAggregator().toColumn)
-      .map(p => p._2)
-      .write.mode(SaveMode.Overwrite).save(targetPath)
+    dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath)
  }

  def main(args: Array[String]): Unit = {
-
-
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
    parser.parseArgument(args)
@ -78,10 +58,10 @@ object SparkConvertORCIDToOAF {
        .master(parser.get("master")).getOrCreate()


-
    val sourcePath = parser.get("sourcePath")
+    val workingPath = parser.get("workingPath")
    val targetPath = parser.get("targetPath")
-    run(spark, sourcePath, targetPath)
+    run(spark, sourcePath, workingPath, targetPath)

  }

--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json
@ -1,5 +1,6 @@
 [
-  {"paramName":"s",   "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read",  "paramRequired": true},
+  {"paramName":"s",   "paramLongName":"sourcePath", "paramDescription": "the path of the Orcid Input file",  "paramRequired": true},
+  {"paramName":"w",   "paramLongName":"workingPath", "paramDescription": "the working path ",  "paramRequired": true},
  {"paramName":"t",   "paramLongName":"targetPath", "paramDescription": "the working dir path",                      "paramRequired": true},
  {"paramName":"m",   "paramLongName":"master",     "paramDescription": "the master name",                          "paramRequired": true}

--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
@ -74,6 +74,11 @@
        <!--    ORCID Parameters    -->
        <property>
            <name>inputPathOrcid</name>
+            <description>the ORCID input path</description>
+        </property>
+
+        <property>
+            <name>workingPathOrcid</name>
            <description>the ORCID working path</description>
        </property>

@ -295,6 +300,7 @@
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
+            <arg>--workingPath</arg><arg>${workingPathOrcid}</arg>
            <arg>--targetPath</arg><arg>${workingPath}/orcidPublication</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala
@ -2,12 +2,14 @@ package eu.dnetlib.doiboost.orcid

 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.Publication
-import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
-import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test
+import org.junit.jupiter.api.io.TempDir
 import org.slf4j.{Logger, LoggerFactory}

+import java.nio.file.Path
 import scala.io.Source

 class MappingORCIDToOAFTest {
@ -24,27 +26,41 @@ class MappingORCIDToOAFTest {
    })
  }

-//  @Test
-//  def testOAFConvert():Unit ={
-//
-//    val spark: SparkSession =
-//      SparkSession
-//        .builder()
-//        .appName(getClass.getSimpleName)
-//        .master("local[*]").getOrCreate()
-//
-//
-//    SparkConvertORCIDToOAF.run( spark,"/Users/sandro/Downloads/orcid", "/Users/sandro/Downloads/orcid_oaf")
-//    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
-//
-//    val df = spark.read.load("/Users/sandro/Downloads/orcid_oaf").as[Publication]
-//    println(df.first.getId)
-//    println(mapper.writeValueAsString(df.first()))
-//
-//
-//
-//
-//  }
+  @Test
+  def testOAFConvert(@TempDir testDir: Path):Unit ={
+    val sourcePath:String = getClass.getResource("/eu/dnetlib/doiboost/orcid/datasets").getPath
+    val targetPath: String =s"${testDir.toString}/output/orcidPublication"
+    val workingPath =s"${testDir.toString}/wp/"
+
+    val conf = new SparkConf()
+    conf.setMaster("local[*]")
+    conf.set("spark.driver.host", "localhost")
+    val spark: SparkSession =
+      SparkSession
+        .builder()
+        .appName(getClass.getSimpleName)
+        .config(conf)
+        .getOrCreate()
+    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
+    import spark.implicits._
+
+    SparkConvertORCIDToOAF.run( spark,sourcePath, workingPath, targetPath)
+
+    val mapper = new ObjectMapper()
+
+
+
+    val oA = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem].count()
+
+
+
+    val p: Dataset[Publication] = spark.read.load(targetPath).as[Publication]
+
+    assertTrue(oA == p.count())
+    println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(p.first()))
+
+
+  }



--- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/datasets/authors/result.gz
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/datasets/authors/result.gz
--- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/datasets/works/part-00000
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcid/datasets/works/part-00000
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java
@ -90,6 +90,7 @@ public class CleanGraphSparkJob {
 			.map((MapFunction<T, T>) value -> fixVocabularyNames(value), Encoders.bean(clazz))
 			.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
 			.map((MapFunction<T, T>) value -> cleanup(value), Encoders.bean(clazz))
+			.filter((FilterFunction<T>) value -> filter(value))
 			.write()
 			.mode(SaveMode.Overwrite)
 			.option("compression", "gzip")
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
@ -67,6 +67,7 @@ public class CleaningFunctionTest {

 		assertNotNull(p_out.getPublisher());
 		assertNull(p_out.getPublisher().getValue());
+
 		assertEquals("und", p_out.getLanguage().getClassid());
 		assertEquals("Undetermined", p_out.getLanguage().getClassname());

@ -120,6 +121,9 @@ public class CleaningFunctionTest {
 				.isPresent());

 		Publication p_cleaned = CleaningFunctions.cleanup(p_out);
+
+		assertEquals(1, p_cleaned.getTitle().size());
+
 		assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
 		assertNull(p_out.getPublisher());

--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json
@ -865,6 +865,28 @@
        "schemename": "dnet:dataCite_title"
      },
      "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
+    },
+    {
+      "dataInfo": {
+        "deletedbyinference": false,
+        "inferenceprovenance": "",
+        "inferred": false,
+        "invisible": false,
+        "provenanceaction": {
+          "classid": "sysimport:crosswalk:datasetarchive",
+          "classname": "sysimport:crosswalk:datasetarchive",
+          "schemeid": "dnet:provenanceActions",
+          "schemename": "dnet:provenanceActions"
+        },
+        "trust": "0.9"
+      },
+      "qualifier": {
+        "classid": "main title",
+        "classname": "main title",
+        "schemeid": "dnet:dataCite_title",
+        "schemename": "dnet:dataCite_title"
+      },
+      "value": "test test 123 test"
    }
  ]
 }