From f9fbb0f26193c8959c8b8cc527c734a60cfdd9f7 Mon Sep 17 00:00:00 2001
From: antleb
Date: Sat, 24 Jul 2021 16:40:28 +0300
Subject: [PATCH 01/24] added indicators for the second sprint

---
 .../step16_7-createIndicatorsTables.sql | 197 +++++++++++++++++-
 1 file changed, 196 insertions(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql
index 8998cb9fca..a2fc88a392 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql
@@ -39,4 +39,199 @@ from publication p
 join result_instance ri on ri.id = p.id
 join datasource on datasource.id = ri.hostedby
 where datasource.id like '%doajarticles%') tmp
-on p.id= tmp.id;
\ No newline at end of file
+on p.id= tmp.id;
+
+create table indi_project_pubs_count stored as parquet as
+select pr.id id, count(p.id) total_pubs from project_results pr
+join publication p on p.id=pr.result
+group by pr.id
+
+create table indi_project_datasets_count stored as parquet as
+select pr.id id, count(d.id) total_datasets from project_results pr
+join dataset d on d.id=pr.result
+group by pr.id
+
+create table indi_project_software_count stored as parquet as
+select pr.id id, count(s.id) total_software from project_results pr
+join software s on s.id=pr.result
+group by pr.id
+
+create table indi_project_otherresearch_count stored as parquet as
+select pr.id id, count(o.id) total_other from project_results pr
+join otherresearchproduct o on o.id=pr.result
+group by pr.id
+
+create table indi_pub_avg_year_country_oa stored as parquet as
+select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
+round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
+ from
+ (SELECT year, country, SUM(CASE
+ WHEN bestlicence='Open Access' THEN 1
+ ELSE 0
+ END) AS OpenAccess, SUM(CASE
+ WHEN bestlicence<>'Open Access' THEN 1
+ ELSE 0
+ END) AS NonOpenAccess
+ FROM publication p
+ join result_organization ro on p.id=ro.id
+ join organization o on o.id=ro.organization
+ where cast(year as int)>=2003 and cast(year as int)<=2021
+ group by year, country) tmp
+
+create table indi_dataset_avg_year_country_oa stored as parquet as
+select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
+round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
+ from
+ (SELECT year, country, SUM(CASE
+ WHEN bestlicence='Open Access' THEN 1
+ ELSE 0
+ END) AS OpenAccess, SUM(CASE
+ WHEN bestlicence<>'Open Access' THEN 1
+ ELSE 0
+ END) AS NonOpenAccess
+ FROM dataset d
+ join result_organization ro on d.id=ro.id
+ join organization o on o.id=ro.organization
+ where cast(year as int)>=2003 and cast(year as int)<=2021
+ group by year, country) tmp
+
+create table indi_software_avg_year_country_oa stored as parquet as
+select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
+round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
+ from
+ (SELECT year, country, SUM(CASE
+ WHEN bestlicence='Open Access' THEN 1
+ ELSE 0
+ END) AS OpenAccess, SUM(CASE
+ WHEN bestlicence<>'Open Access' THEN 1
+ ELSE 0
+ END) AS NonOpenAccess
+ FROM
software s + join result_organization ro on s.id=ro.id + join SOURCER.organization o on o.id=ro.organization + where cast(year as int)>=2003 and cast(year as int)<=2021 + group by year, country) tmp + + +create table indi_other_avg_year_country_oa stored as parquet as +select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, +round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA + from + (SELECT year, country, SUM(CASE + WHEN bestlicence='Open Access' THEN 1 + ELSE 0 + END) AS OpenAccess, SUM(CASE + WHEN bestlicence<>'Open Access' THEN 1 + ELSE 0 + END) AS NonOpenAccess + FROM otherresearchproduct orp + join result_organization ro on orp.id=ro.id + join organization o on o.id=ro.organization + where cast(year as int)>=2003 and cast(year as int)<=2021 + group by year, country) tmp + +create table indi_pub_avg_year_context_oa stored as parquet as +with total as +(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc +join context c on pc.concept like concat('%',c.id,'%') +join publication p on p.id=pc.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by c.name, year ) +select year, name, round(no_of_pubs/total*100,3) averageofpubs +from total + +create table indi_dataset_avg_year_context_oa stored as parquet as +with total as +(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc +join context c on pc.concept like concat('%',c.id,'%') +join dataset p on p.id=pc.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by c.name, year ) +select year, name, round(no_of_pubs/total*100,3) averageofdataset +from total + +create table indi_software_avg_year_context_oa stored as parquet as +with total as +(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc +join context c on pc.concept like concat('%',c.id,'%') +join software p on p.id=pc.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by c.name, year ) +select year, name, round(no_of_pubs/total*100,3) averageofsoftware +from total + +create table indi_other_avg_year_context_oa stored as parquet as +with total as +(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc +join context c on pc.concept like concat('%',c.id,'%') +join otherresearchproduct p on p.id=pc.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by c.name, year ) +select year, name, round(no_of_pubs/total*100,3) averageofother +from total + +create table indi_other_avg_year_content_oa stored as parquet as +with total as +(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +from otherresearchproduct_datasources pd +join datasource d on datasource=d.id +join otherresearchproduct p on p.id=pd.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by d.type, year) +select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct +from total + +create table indi_software_avg_year_content_oa stored as parquet as +with total as +(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +from software_datasources pd +join datasource d on 
datasource=d.id +join software p on p.id=pd.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by d.type, year) +select year, type, round(no_of_pubs/total*100,3) averageOfSoftware +from total + +create table indi_dataset_avg_year_content_oa stored as parquet as +with total as +(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +from dataset_datasources pd +join datasource d on datasource=d.id +join dataset p on p.id=pd.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by d.type, year) +select year, type, round(no_of_pubs/total*100,3) averageOfDatasets +from total + +create table indi_pub_avg_year_content_oa stored as parquet as +with total as +(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +from publication_datasources pd +join datasource d on datasource=d.id +join publication p on p.id=pd.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by d.type, year) +select year, type, round(no_of_pubs/total*100,3) averageOfPubs +from total + +create table indi_pub_has_cc_licence stored as parquet as +select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license +from publication p +left outer join (select p.id, license.type as lic from publication p +join publication_licenses as license on license.id = p.id +where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp +on p.id= tmp.id + +create table indi_pub_has_cc_licence_url stored as parquet as +select distinct p.id, (case when lic_host='' or lic_host is null then 0 else 1 end) as has_cc_license_url +from publication p +left outer join (select p.id, lower(parse_url(license.type, "HOST")) as lic_host +from publication p +join publication_licenses as license on license.id = p.id +WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp +on p.id= tmp.id + + +create table indi_pub_has_abstract stored as parquet as +select distinct publication.id, coalesce(abstract, 1) has_abstract +from publication \ No newline at end of file From f3b9570354bd170511c56e9995ac6188601add56 Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 26 Jul 2021 13:00:16 +0300 Subject: [PATCH 02/24] properly invalidating metadata --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index d5aa207d19..fb944f4ffb 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -13,7 +13,7 @@ echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -impala-shell -d ${TARGET} -q "invalidate metadata" +impala-shell -q "invalidate metadata" impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f - echo "Indicators created" \ No newline at end of file From ed185fd7ed479e385904eb4b8edc4fe821844f5c Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 27 Jul 2021 11:42:47 +0300 Subject: [PATCH 
03/24] added missing semicolons

---
 .../step16_7-createIndicatorsTables.sql | 39 +++++++++----------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql
index a2fc88a392..f1ebf0d87f 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql
@@ -44,22 +44,22 @@ on p.id= tmp.id;
 create table indi_project_pubs_count stored as parquet as
 select pr.id id, count(p.id) total_pubs from project_results pr
 join publication p on p.id=pr.result
-group by pr.id
+group by pr.id;
 
 create table indi_project_datasets_count stored as parquet as
 select pr.id id, count(d.id) total_datasets from project_results pr
 join dataset d on d.id=pr.result
-group by pr.id
+group by pr.id;
 
 create table indi_project_software_count stored as parquet as
 select pr.id id, count(s.id) total_software from project_results pr
 join software s on s.id=pr.result
-group by pr.id
+group by pr.id;
 
 create table indi_project_otherresearch_count stored as parquet as
 select pr.id id, count(o.id) total_other from project_results pr
 join otherresearchproduct o on o.id=pr.result
-group by pr.id
+group by pr.id;
 
 create table indi_pub_avg_year_country_oa stored as parquet as
 select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
@@ -76,7 +76,7 @@ round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
 join result_organization ro on p.id=ro.id
 join organization o on o.id=ro.organization
 where cast(year as int)>=2003 and cast(year as int)<=2021
- group by year, country) tmp
+ group by year, country) tmp;
 
 create table indi_dataset_avg_year_country_oa stored as parquet as
 select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
@@ -93,7 +93,7 @@ round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
 join result_organization ro on d.id=ro.id
 join organization o on o.id=ro.organization
 where cast(year as int)>=2003 and cast(year as int)<=2021
- group by year, country) tmp
+ group by year, country) tmp;
 
 create table indi_software_avg_year_country_oa stored as parquet as
 select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
@@ -110,7 +110,7 @@ round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
 join result_organization ro on s.id=ro.id
 join SOURCER.organization o on o.id=ro.organization
 where cast(year as int)>=2003 and cast(year as int)<=2021
- group by year, country) tmp
+ group by year, country) tmp;
 
 
 create table indi_other_avg_year_country_oa stored as parquet as
 select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA,
@@ -128,7 +128,7 @@ round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA
 join result_organization ro on orp.id=ro.id
 join organization o on o.id=ro.organization
 where cast(year as int)>=2003 and cast(year as int)<=2021
- group by year, country) tmp
+ group by year, country) tmp;
 
 create table indi_pub_avg_year_context_oa stored as parquet as
 with total as
 (select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc
 join context c on pc.concept like concat('%',c.id,'%')
 join publication p on p.id=pc.id
 where cast(year as int)>=2003 and cast(year as int)<=2021
 group by c.name, year )
 select year, name, round(no_of_pubs/total*100,3)
averageofpubs
-from total
+from total;
 
 create table indi_dataset_avg_year_context_oa stored as parquet as
 with total as
 (select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc
 join context c on pc.concept like concat('%',c.id,'%')
 join dataset p on p.id=pc.id
 where cast(year as int)>=2003 and cast(year as int)<=2021
 group by c.name, year )
 select year, name, round(no_of_pubs/total*100,3) averageofdataset
-from total
+from total;
 
 create table indi_software_avg_year_context_oa stored as parquet as
 with total as
 (select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc
 join context c on pc.concept like concat('%',c.id,'%')
 join software p on p.id=pc.id
 where cast(year as int)>=2003 and cast(year as int)<=2021
 group by c.name, year )
 select year, name, round(no_of_pubs/total*100,3) averageofsoftware
-from total
+from total;
 
 create table indi_other_avg_year_context_oa stored as parquet as
 with total as
 (select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc
 join context c on pc.concept like concat('%',c.id,'%')
 join otherresearchproduct p on p.id=pc.id
 where cast(year as int)>=2003 and cast(year as int)<=2021
 group by c.name, year )
 select year, name, round(no_of_pubs/total*100,3) averageofother
-from total
+from total;
 
 create table indi_other_avg_year_content_oa stored as parquet as
 with total as
 (select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
 from otherresearchproduct_datasources pd
 join datasource d on datasource=d.id
 join otherresearchproduct p on p.id=pd.id
 where cast(year as int)>=2003 and cast(year as int)<=2021
 group by d.type, year)
 select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct
-from total
+from total;
 
 create table indi_software_avg_year_content_oa stored as parquet as
 with total as
 (select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
 from software_datasources pd
 join datasource d on datasource=d.id
 join software p on p.id=pd.id
 where cast(year as int)>=2003 and cast(year as int)<=2021
 group by d.type, year)
 select year, type, round(no_of_pubs/total*100,3) averageOfSoftware
-from total
+from total;
 
 create table indi_dataset_avg_year_content_oa stored as parquet as
 with total as
 (select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
 from dataset_datasources pd
 join datasource d on datasource=d.id
 join dataset p on p.id=pd.id
 where cast(year as int)>=2003 and cast(year as int)<=2021
 group by d.type, year)
 select year, type, round(no_of_pubs/total*100,3) averageOfDatasets
-from total
+from total;
 
 create table indi_pub_avg_year_content_oa stored as parquet as
 with total as
 (select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total
 from publication_datasources pd
 join datasource d on datasource=d.id
 join publication p on p.id=pd.id
 where cast(year as int)>=2003 and cast(year as int)<=2021
 group by d.type, year)
 select year, type, round(no_of_pubs/total*100,3) averageOfPubs
-from total
+from total;
 
 create table indi_pub_has_cc_licence stored as parquet as
 select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
 from publication p
 left outer join (select p.id, license.type as lic from publication p
 join publication_licenses as license on license.id = p.id
 where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
-on p.id= tmp.id
+on p.id= tmp.id;
 
 create table indi_pub_has_cc_licence_url stored as parquet as
 select distinct p.id, (case when lic_host='' or lic_host is null then 0 else 1 end) as has_cc_license_url
 from publication p
 left outer join (select p.id, lower(parse_url(license.type, "HOST")) as lic_host
 from publication p
 join publication_licenses as license on license.id = p.id
 WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp
-on p.id= tmp.id
-
+on p.id= tmp.id;
 
 create table indi_pub_has_abstract stored as parquet as
 select distinct publication.id, coalesce(abstract, 1) has_abstract
-from publication
\ No newline at end of file
+from publication;
\ No newline at end of file

From 1a28a69cac1031bda96929c5a6512f52d8fdda2d Mon Sep 17 00:00:00 2001
From: antleb
Date: Tue, 27 Jul 2021 15:14:09 +0300
Subject: [PATCH 04/24] changed the citee in *_citations to cites

---
 .../graph/stats/oozie_app/scripts/step2.sql | 23
++----------------- .../graph/stats/oozie_app/scripts/step3.sql | 21 ++--------------- .../graph/stats/oozie_app/scripts/step4.sql | 21 ++--------------- .../graph/stats/oozie_app/scripts/step5.sql | 21 ++--------------- 4 files changed, 8 insertions(+), 78 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 75b24b1893..bb0d0ac6ca 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -90,27 +90,8 @@ FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.publication_citations AS -SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result +SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and p.datainfo.deletedbyinference = false; - --- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + and p.datainfo.deletedbyinference = false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 540cc03a51..953eaad6a9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -41,7 +41,7 @@ FROM ${openaire_db_name}.dataset d WHERE d.datainfo.deletedbyinference = FALSE; CREATE TABLE 
${stats_db_name}.dataset_citations AS -SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result +SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" @@ -95,21 +95,4 @@ CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false; --- --- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +where p.datainfo.deletedbyinference = false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 54345e0741..0210dc8cb9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -41,7 +41,7 @@ from ${openaire_db_name}.software s where s.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.software_citations AS -SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT +SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" @@ -95,21 +95,4 @@ CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false; --- --- ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS; --- ANALYZE TABLE 
${stats_db_name}.software_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +where p.datainfo.deletedbyinference = false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index 36ad5d92a8..f7b302186f 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -41,7 +41,7 @@ WHERE o.datainfo.deletedbyinference = FALSE; -- Otherresearchproduct_citations CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS -SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT +SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and o.datainfo.deletedbyinference = false; @@ -86,21 +86,4 @@ where p.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false; - --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE 
${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS FOR COLUMNS;
\ No newline at end of file
+where p.datainfo.deletedbyinference = false;
\ No newline at end of file

From 43e62fcae92b9c955201d0b2f493d22cc8b32744 Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Wed, 28 Jul 2021 11:04:55 +0200
Subject: [PATCH 05/24] DoiBoost AccessRight #4362 - related to https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/126/files#issuecomment-4193

---
 .../dnetlib/doiboost/DoiBoostMappingUtil.scala | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
index 686a2f1f15..d018948fc2 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
@@ -179,20 +179,6 @@ object DoiBoostMappingUtil {
 }
 
- //val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd")
-
-
-
- // val pub_date = LocalDate.parse(date, formatter)
-
-// if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){
-// val oaq : AccessRight = getOpenAccessQualifier()
-// oaq.setOpenAccessRoute(OpenAccessRoute.hybrid)
-// return oaq
-// }
-// else{
-// return getEmbargoedAccessQualifier()
-// }
 }
 return getClosedAccessQualifier()
@@ -206,12 +192,12 @@ object DoiBoostMappingUtil {
 }
 
 def getRestrictedQualifier():AccessRight = {
- OafMapperUtils.accessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+ OafMapperUtils.accessRight( "RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
 }
 
 
 def getUnknownQualifier():AccessRight = {
- OafMapperUtils.accessRight("UNKNOWN","not available",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+ OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
 }
 

From 5fe016dcbccda43276342efd1547276d723ff5db Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Wed, 28 Jul 2021 11:14:28 +0200
Subject: [PATCH 06/24] DoiBoost AccessRight #4362 - related to https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/126/files#issuecomment-4194

---
 .../main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
index d018948fc2..ea65fc747b 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
@@ -188,7 +188,8 @@
object DoiBoostMappingUtil {
 
 def getOpenAccessQualifier():AccessRight = {
- OafMapperUtils.accessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
+
+ OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN,"Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
 }
 
 def getRestrictedQualifier():AccessRight = {

From 80d5b3b4deb5098ba85dd701a35cfecf82b28ef9 Mon Sep 17 00:00:00 2001
From: "miriam.baglioni"
Date: Wed, 28 Jul 2021 11:16:49 +0200
Subject: [PATCH 07/24] DoiBoost AccessRight #4362 - removing commented code

---
 .../eu/dnetlib/doiboost/DoiBoostMappingUtil.scala | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
index ea65fc747b..e688804337 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
@@ -236,8 +236,7 @@ object DoiBoostMappingUtil {
 i.setAccessright(getOpenAccessQualifier())
 i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
 }
-// val ar = getOpenAccessQualifier()
-// publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
+
 }
 else {
 hb = ModelConstants.UNKNOWN_REPOSITORY
@@ -246,17 +245,7 @@ object DoiBoostMappingUtil {
 })
 publication.setBestaccessright(OafMapperUtils.createBestAccessRights(publication.getInstance()))
-// val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid)
-// if (ar.nonEmpty) {
-// if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){
-// val ar = getOpenAccessQualifier()
-// publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
-// }
-// else {
-// val ar = getRestrictedQualifier()
-// publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename))
-// }
-// }
+
 publication
 }

From c806387d4bfa74491375afbc80c39d67547ce9f6 Mon Sep 17 00:00:00 2001
From: antleb
Date: Wed, 28 Jul 2021 12:28:04 +0300
Subject: [PATCH 08/24] added result_orcid, result_project provenance, issn in datasources

---
 .../graph/stats/oozie_app/scripts/step13.sql | 20 ++++++++++---------
 .../graph/stats/oozie_app/scripts/step15.sql | 11 +---------
 .../graph/stats/oozie_app/scripts/step6.sql | 2 +-
 .../graph/stats/oozie_app/scripts/step7.sql | 9 ++-------
 .../graph/stats/oozie_app/scripts/step8.sql | 19 +++++++-----------
 5 files changed, 22 insertions(+), 39 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
index d79396b3bc..e4e81175cc 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql
@@ -57,12 +57,14 @@ UNION ALL
 SELECT * FROM ${stats_db_name}.software_sources
 UNION ALL
 SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
---
--- ANALYZE TABLE
${stats_db_name}.publication_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + + +create table ${stats_db_name}.result_orcid as +select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid +from ( + SELECT substr(res.id, 4) as id, auth_pid.value as orcid + FROM ${openaire_db_name}.result res + LATERAL VIEW explode(author) a as auth + LATERAL VIEW explode(auth.pid) ap as auth_pid + LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type + WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 8f364d7478..8e66e05c02 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -33,13 +33,4 @@ select * from ${stats_db_name}.dataset_refereed union all select * from ${stats_db_name}.software_refereed union all -select * from ${stats_db_name}.otherresearchproduct_refereed; --- --- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +select * from ${stats_db_name}.otherresearchproduct_refereed; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 5d81e97bb9..4cbdba9317 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -13,7 +13,7 @@ WHERE r.reltype = 'projectOrganization' and r.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.project_results AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result +SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM 
${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.datainfo.deletedbyinference = false; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index ae540b9b23..b3cbc9b419 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -130,12 +130,7 @@ WHERE r.reltype = 'resultOrganization' and r.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.result_projects AS -select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend +select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result - JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; - --- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index de0fedd7e2..5d770dd617 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -17,7 +17,9 @@ CREATE TABLE ${stats_db_name}.datasource_tmp `latitude` STRING, `longitude` STRING, `websiteurl` STRING, - `compatibility` STRING + `compatibility` STRING, + issn_printed STRING, + issn_online STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -- Insert statement that takes into account the piwik_id of the openAIRE graph @@ -32,7 +34,9 @@ SELECT substr(d1.id, 4) AS id, d1.latitude.value AS latitude, d1.longitude.value AS longitude, d1.websiteurl.value AS websiteurl, - d1.openairecompatibility.classid AS compatibility + d1.openairecompatibility.classid AS compatibility, + d1.journal.issnprinted AS issn_printed, + d1.journal.issnonline AS issn_online FROM ${openaire_db_name}.datasource d1 LEFT OUTER JOIN (SELECT id, split(originalidd, '\\:')[1] as piwik_id @@ -97,13 +101,4 @@ where d.datainfo.deletedbyinference = false; CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result -FROM ${stats_db_name}.result_datasources; - --- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS FOR COLUMNS; --- 
ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +FROM ${stats_db_name}.result_datasources; \ No newline at end of file From 2fff24df55f2bafd2b0f67837d2f266dcf934fa8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 28 Jul 2021 11:34:19 +0200 Subject: [PATCH 09/24] code formatting --- .../dhp/oa/dedup/GroupEntitiesSparkJob.java | 3 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 28 ++++++++++--------- .../oa/provision/utils/XmlRecordFactory.java | 2 +- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java index 3f27b94422..58009bfcfc 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/GroupEntitiesSparkJob.java @@ -38,8 +38,7 @@ import scala.Tuple2; /** * Groups the graph content by entity identifier to ensure ID uniqueness */ -public class -GroupEntitiesSparkJob { +public class GroupEntitiesSparkJob { private static final Logger log = LoggerFactory.getLogger(GroupEntitiesSparkJob.class); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index c41a6c68c0..63f18a803a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -1,13 +1,13 @@ package eu.dnetlib.dhp.oa.graph.raw; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.oa.graph.clean.GraphCleaningFunctionsTest; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.PidType; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.lenient; + +import java.io.IOException; +import java.util.List; +import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -16,12 +16,14 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; -import java.io.IOException; -import java.util.List; -import java.util.Optional; +import com.fasterxml.jackson.databind.ObjectMapper; -import static org.junit.jupiter.api.Assertions.*; -import static org.mockito.Mockito.lenient; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.oa.graph.clean.GraphCleaningFunctionsTest; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.PidType; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) public class MappersTest { @@ -340,7 +342,7 @@ public class MappersTest { assertEquals(2, p.getOriginalId().size()); assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:pub.uni-bielefeld.de:2949739"))); - 
//assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0)); + // assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0)); assertValidId(p.getCollectedfrom().get(0).getKey()); assertTrue(p.getAuthor().size() > 0); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index a985d23718..2c82402900 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -16,7 +16,6 @@ import javax.xml.transform.*; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; -import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import org.apache.commons.lang3.StringUtils; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; @@ -43,6 +42,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; public class XmlRecordFactory implements Serializable { From c806387d4bfa74491375afbc80c39d67547ce9f6 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 20 Jul 2021 19:31:43 +0200 Subject: [PATCH 10/24] tests for enermaps --- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 25 +++++++ .../eu/dnetlib/dhp/oa/graph/raw/enermaps.xml | 72 +++++++++++++++++++ .../oa/provision/XmlRecordFactoryTest.java | 29 ++++++++ .../eu/dnetlib/dhp/oa/provision/enermaps.json | 1 + 4 files changed, 127 insertions(+) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/enermaps.xml create mode 100644 dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/enermaps.json diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 63f18a803a..fb4a5b5da9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -559,6 +559,31 @@ public class MappersTest { assertNotNull(d.getInstance().get(0).getUrl()); } + @Test + void testEnermaps() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("enermaps.xml")); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Dataset); + + final Dataset d = (Dataset) list.get(0); + + assertValidId(d.getId()); + assertValidId(d.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); + assertEquals(1, d.getAuthor().size()); + assertEquals(1, d.getInstance().size()); + assertNotNull(d.getInstance().get(0).getUrl()); + assertNotNull(d.getContext()); + assertTrue(StringUtils.isNotBlank(d.getContext().get(0).getId())); + assertEquals("enermaps::selection::tgs00004", d.getContext().get(0).getId()); + } + @Test void testClaimFromCrossref() 
throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml")); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/enermaps.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/enermaps.xml new file mode 100644 index 0000000000..362b40c85c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/enermaps.xml @@ -0,0 +1,72 @@ + + + + enermaps____::04149ee428d07360314c2cb3ba95d41e + tgs00004 + 2021-07-20T18:43:12.096+02:00 + enermaps____ + + + + https://ec.europa.eu/eurostat/web/products-datasets/-/tgs00004 + + + Statistical Office of the European Union (Eurostat) + + + + + Regional GDP + + + Statistical Office of the European Union (Eurostat) + 2020 + + 2020-10-07 + + + + OPEN + Creative Commons Attribution 4.0 International + + + GDP expressed in PPS (purchasing power standards) eliminates differences in price levels between countries. Calculations on a per inhabitant basis allow for the comparison of economies and regions significantly different in absolute size. GDP per inhabitant in PPS is the key variable for determining the eligibility of NUTS 2 regions in the framework of the European Unions structural policy. + + 0021 + 2020-10-07 + OPEN + Creative Commons Attribution 4.0 International + + + + + + + + + https%3A%2F%2Flab.idiap.ch%2Fenermaps%2Fapi%2Fdatacite + + + + + + + false + false + 0.9 + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java index 6631cb4da6..a5a1563aa0 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java @@ -7,6 +7,8 @@ import java.io.IOException; import java.io.StringReader; import java.util.List; +import eu.dnetlib.dhp.oa.provision.utils.ContextDef; +import eu.dnetlib.dhp.schema.oaf.Dataset; import org.apache.commons.io.IOUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -131,4 +133,31 @@ public class XmlRecordFactoryTest { System.out.println(doc.asXML()); assertEquals("", doc.valueOf("//rel/validated")); } + + @Test + public void testEnermapsRecord() throws IOException, DocumentException { + + String contextmap = "" + + ""+ + ""+ + ""; + + ContextMapper contextMapper = ContextMapper.fromXml(contextmap); + XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation, + otherDsTypeId); + + Dataset d = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResourceAsStream("enermaps.json")), Dataset.class); + + JoinedEntity je = new JoinedEntity<>(d); + + String xml = xmlRecordFactory.build(je); + + assertNotNull(xml); + + Document doc = new SAXReader().read(new StringReader(xml)); + assertNotNull(doc); + System.out.println(doc.asXML()); + assertEquals("enermaps::selection::tgs00004", doc.valueOf("//concept/@id")); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/enermaps.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/enermaps.json new file mode 100644 index 0000000000..dcd4c2ee17 --- /dev/null +++ 
b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/enermaps.json @@ -0,0 +1 @@ +{"collectedfrom":[{"key":"10|enermaps____::d77d5e503ad1439f585ac494268b351b","value":"Enermaps","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626800904248,"id":"50|enermaps____::04149ee428d07360314c2cb3ba95d41e","originalId":["50|enermaps____::04149ee428d07360314c2cb3ba95d41e","tgs00004"],"pid":[],"dateofcollection":"2021-07-20T18:43:12.096+02:00","dateoftransformation":"","extraInfo":[],"oaiprovenance":{"originDescription":{"harvestDate":"2021-07-20T18:43:12.096+02:00","altered":true,"baseURL":"https%3A%2F%2Flab.idiap.ch%2Fenermaps%2Fapi%2Fdatacite","identifier":"","datestamp":"","metadataNamespace":""}},"measures":null,"author":[{"fullname":"Statistical Office of the European Union (Eurostat)","name":"","surname":"","rank":1,"pid":[],"affiliation":[]}],"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[],"title":[{"value":"\n Regional GDP\n ","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[{"value":"2020-10-07","qualifier":{"classid":"Issued","classname":"Issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"description":[{"value":"GDP expressed in PPS (purchasing power standards) eliminates differences in price levels between countries. Calculations on a per inhabitant basis allow for the comparison of economies and regions significantly different in absolute size. 
GDP per inhabitant in PPS is the key variable for determining the eligibility of NUTS 2 regions in the framework of the European Unions structural policy.","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofacceptance":{"value":"2020-10-07","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":{"value":"Statistical Office of the European Union (Eurostat)","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"embargoenddate":null,"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"coverage":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[{"id":"enermaps::selection::tgs00004","dataInfo":[{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}]}],"externalReference":[],"instance":[{"license":{"value":"Creative Commons Attribution 4.0 International","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes","openAccessRoute":null},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|openaire____::55045bd2a65019fd8e6741a755395c8c","value":"Unknown 
Repository","dataInfo":null},"url":["https://ec.europa.eu/eurostat/web/products-datasets/-/tgs00004"],"distributionlocation":null,"collectedfrom":{"key":"10|enermaps____::d77d5e503ad1439f585ac494268b351b","value":"Enermaps","dataInfo":null},"pid":[],"alternateIdentifier":[],"dateofacceptance":{"value":"2020-10-07","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"}}],"storagedate":{"value":"2020-10-07","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"device":null,"size":null,"version":null,"lastmetadataupdate":null,"metadataversionnumber":null,"geolocation":[]} From df8715a1ecc57b2221a960526700f53d1f6cb676 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Wed, 28 Jul 2021 11:58:26 +0200 Subject: [PATCH 11/24] format code after mvn compile --- .../dhp/oa/provision/XmlRecordFactoryTest.java | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java index a5a1563aa0..221049f903 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java @@ -7,8 +7,6 @@ import java.io.IOException; import java.io.StringReader; import java.util.List; -import eu.dnetlib.dhp.oa.provision.utils.ContextDef; -import eu.dnetlib.dhp.schema.oaf.Dataset; import org.apache.commons.io.IOUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -23,8 +21,10 @@ import com.google.common.collect.Lists; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; +import eu.dnetlib.dhp.oa.provision.utils.ContextDef; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; +import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; @@ -137,17 +137,18 @@ public class XmlRecordFactoryTest { @Test public void testEnermapsRecord() throws IOException, DocumentException { - String contextmap = "" + - ""+ - ""+ - ""; + String contextmap = "" + + + "" + + "" + + ""; ContextMapper contextMapper = ContextMapper.fromXml(contextmap); XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation, - otherDsTypeId); + otherDsTypeId); Dataset d = OBJECT_MAPPER - .readValue(IOUtils.toString(getClass().getResourceAsStream("enermaps.json")), Dataset.class); + .readValue(IOUtils.toString(getClass().getResourceAsStream("enermaps.json")), Dataset.class); JoinedEntity je = new JoinedEntity<>(d); From 
9b181ffa73c6770a97caa9218c747e4d392d98ec Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 28 Jul 2021 16:31:29 +0300 Subject: [PATCH 12/24] added the h2020 classification scheme for projects --- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 4cbdba9317..0c4a767a4e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -18,6 +18,12 @@ FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.datainfo.deletedbyinference = false; +create table ${stats_db_name}.project_classification as +select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 +from ${openaire_db_name}project p + lateral view explode(p.h2020classification) classifs as class +where p.datainfo.deletedbyinference=false and class.h2020programme is not null; + CREATE TABLE ${stats_db_name}.project_tmp ( id STRING, From 3d8f0f629b3ff2c9d2b7401b7e62381f1547531c Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 28 Jul 2021 16:15:15 +0200 Subject: [PATCH 13/24] implemented workflow of creation action set for scholexplorer --- .../datacite/AbstractRestClient.scala | 32 ++++++++--------- dhp-workflows/dhp-graph-provision/pom.xml | 35 +++++++++++++++++++ .../sx/provision/SparkCreateActionset.scala | 31 ++++------------ .../dhp/sx/actionset/oozie_app/workflow.xml | 8 ++--- 4 files changed, 61 insertions(+), 45 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala index 823187afe6..92a870e37a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala @@ -64,26 +64,24 @@ abstract class AbstractRestClient extends Iterator[String]{ .setSocketTimeout(timeout * 1000).build() val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build() var tries = 4 - try { - while (tries > 0) { + while (tries > 0) { println(s"requesting ${r.getURI}") - val response = client.execute(r) - println(s"get response with status${response.getStatusLine.getStatusCode}") - if (response.getStatusLine.getStatusCode > 400) { - tries -= 1 + try { + val response = client.execute(r) + println(s"get response with status${response.getStatusLine.getStatusCode}") + if (response.getStatusLine.getStatusCode > 400) { + tries -= 1 + } + else + return IOUtils.toString(response.getEntity.getContent) + } catch { + case e: Throwable => + println(s"Error on requesting ${r.getURI}") + e.printStackTrace() + tries-=1 } - else - return IOUtils.toString(response.getEntity.getContent) } "" - } catch { - case e: Throwable => - throw new RuntimeException("Error on executing request ", e) - } finally try client.close() - catch { - case e: IOException => - throw new RuntimeException("Unable to close client ", e) - } - } + } getBufferData() } \ No newline at end of file diff --git 
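Note on patch 13's AbstractRestClient change above: the try/catch now sits inside the while loop, so a transient I/O error spends one of the four attempts instead of aborting the whole iteration, and the method degrades to returning an empty string once the retry budget is exhausted. A minimal standalone sketch of that bounded-retry pattern follows; the class and method names are illustrative, not part of this patch set.

    import java.io.IOException;

    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;

    public class BoundedRetryGet {

        // Returns the body of the first response with status <= 400, or "" once
        // the retry budget is spent; an error status and an I/O failure both
        // cost exactly one attempt, mirroring the patched Scala loop.
        public static String get(final String url, final int maxTries) throws IOException {
            try (CloseableHttpClient client = HttpClients.createDefault()) {
                int tries = maxTries;
                while (tries > 0) {
                    try (CloseableHttpResponse response = client.execute(new HttpGet(url))) {
                        if (response.getStatusLine().getStatusCode() > 400) {
                            tries--;
                        } else {
                            return EntityUtils.toString(response.getEntity());
                        }
                    } catch (final IOException e) {
                        tries--;
                    }
                }
                return "";
            }
        }
    }

The sketch closes the client via try-with-resources; in the patched Scala the explicit close was dropped together with the outer try/catch/finally.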
a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index c279436d79..e402d06004 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -9,6 +9,41 @@ dhp-graph-provision + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + -Xmax-classfile-name + 200 + + ${scala.version} + + + + + + diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala index 6f0cdcf8aa..faf386d257 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala @@ -43,7 +43,7 @@ object SparkCreateActionset { val relation = spark.read.load(s"$sourcePath/relation").as[Relation] relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) - .flatMap(r => List(r.getSource,r.getTarget)).distinct().write.save(s"$workingDirFolder/id_relation") + .flatMap(r => List(r.getSource,r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation") val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String] @@ -56,35 +56,18 @@ object SparkCreateActionset { relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) .write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf") - log.info("saving publication") + log.info("saving entities") - val publication:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/publication").as[Result].map(p => (p.getId, p)) + val entities:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders)) - publication - .joinWith(idRelation, publication("_1").equalTo(idRelation("value"))) + + entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]) + entities + .joinWith(idRelation, entities("_1").equalTo(idRelation("value"))) .map(p => p._1._2) .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") - log.info("saving dataset") - val dataset:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/dataset").as[Result].map(p => (p.getId, p)) - dataset - .joinWith(idRelation, publication("_1").equalTo(idRelation("value"))) - .map(p => p._1._2) - .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") - log.info("saving software") - val software:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/software").as[Result].map(p => (p.getId, p)) - software - .joinWith(idRelation, publication("_1").equalTo(idRelation("value"))) - .map(p => p._1._2) - .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") - - log.info("saving Other Research product") - val orp:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/otherresearchproduct").as[Result].map(p => (p.getId, p)) - orp - .joinWith(idRelation, publication("_1").equalTo(idRelation("value"))) - .map(p => p._1._2) - .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") } } diff --git 
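The SparkCreateActionset refactor above collapses four near-identical blocks (where the removed dataset/software/otherresearchproduct variants still joined on publication("_1")) into a single pass over $sourcePath/entities/*; the leftover entities.filter(...).map(...) line has no effect, since its result is discarded. In effect the job keeps only the entities whose id appears as the source or target of a surviving relation, i.e. a semi-join. A column-level Java sketch of that idea, with column and path handling assumed for illustration (the patch itself works on Kryo-encoded beans):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class ActionSetSemiJoinSketch {

        // Keeps only the entities referenced by at least one surviving relation.
        // Column names (source, target, relClass, id) are assumed; the
        // dataInfo.deletedbyinference filter of the patch is omitted for brevity.
        public static Dataset<Row> entitiesInRelations(final SparkSession spark,
            final String relationPath, final String entityPath) {

            final Dataset<Row> rels = spark.read().parquet(relationPath)
                .filter("not lower(relClass) like '%merge%'");

            // distinct union of both endpoints of every surviving relation
            final Dataset<Row> ids = rels.selectExpr("source as id")
                .union(rels.selectExpr("target as id"))
                .distinct();

            // left_semi keeps each matching entity exactly once, which is what
            // the joinWith(...).map(p -> p._1._2) combination achieves above
            final Dataset<Row> entities = spark.read().parquet(entityPath);
            return entities.join(ids, entities.col("id").equalTo(ids.col("id")), "left_semi");
        }
    }

The patch materialises the id list under $workingDirFolder/id_relation so it can be reused across steps; with a pure semi-join that intermediate write would be optional.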
a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml index ef86a17728..7c4b3dd269 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml @@ -14,7 +14,7 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -26,7 +26,7 @@ cluster Create Action Set eu.dnetlib.dhp.sx.provision.SparkCreateActionset - dhp-aggregation-${projectVersion}.jar + dhp-graph-provision-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -42,7 +42,7 @@ --workingDirFolder${workingDirFolder} --masteryarn-cluster - + @@ -53,7 +53,7 @@ cluster Save Action Set eu.dnetlib.dhp.sx.provision.SparkSaveActionSet - dhp-aggregation-${projectVersion}.jar + dhp-graph-provision-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} From 6dddad86ee10543a2d96e3fe7a555bd287492e0c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 28 Jul 2021 16:21:29 +0200 Subject: [PATCH 14/24] [cleaning] title cleaning based on the me.xuender:unidecode library --- dhp-common/pom.xml | 5 ++ .../oaf/utils/GraphCleaningFunctions.java | 19 ++--- .../schema/oaf/utils/OafMapperUtilsTest.java | 25 +++++-- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 26 +++++++ .../eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml | 70 +++++++++++++++++++ pom.xml | 5 ++ 6 files changed, 136 insertions(+), 14 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 74f31cf357..4c7810c47c 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -25,6 +25,11 @@ com.github.sisyphsu dateparser + + me.xuender + unidecode + + org.apache.spark spark-core_2.11 diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index e5181b1119..1d002ed7ec 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -7,22 +7,19 @@ import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; import java.util.*; import java.util.function.Function; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; -import org.jetbrains.annotations.NotNull; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.collect.Lists; -import com.google.common.collect.Maps; import com.google.common.collect.Sets; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import me.xuender.unidecode.Unidecode; public class GraphCleaningFunctions extends CleaningFunctions { @@ -194,11 +191,15 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter( - sp -> sp - .getValue() - .toLowerCase() - .replaceAll(TITLE_FILTER_REGEX, "") - .length() > 
TITLE_FILTER_RESIDUAL_LENGTH) + sp -> { + final String title = sp + .getValue() + .toLowerCase(); + final String residual = Unidecode + .decode(title) + .replaceAll(TITLE_FILTER_REGEX, ""); + return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH; + }) .map(GraphCleaningFunctions::cleanValue) .collect(Collectors.toList())); } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java index eefa1e9a31..8d519a93f6 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -4,12 +4,8 @@ package eu.dnetlib.dhp.schema.oaf.utils; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; -import java.time.LocalDate; -import java.time.format.DateTimeFormatter; import java.util.HashSet; import java.util.List; -import java.util.Locale; -import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; @@ -19,13 +15,32 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; +import me.xuender.unidecode.Unidecode; public class OafMapperUtilsTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + @Test + public void testUnidecode() { + + assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ")); + assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛")); + assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼")); + assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい")); + assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի")); + assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики")); + assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ")); + assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση της ικανοποίησης")); + assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية")); + assertEquals("abc def ghi", Unidecode.decode("abc def ghi")); + } + @Test public void testDateValidation() { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 63f18a803a..ba4211a3f2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.graph.raw; +import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.cleanup; +import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.fixVocabularyNames; import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.lenient; @@ -640,6 +642,30 @@ public class MappersTest { System.out.println(p.getTitle().get(0).getValue()); } + @Test + void testJairo() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_jairo.xml")); + final List list = new OafToOafMapper(vocs, false, 
true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + final Publication p = (Publication) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + + assertNotNull(p.getTitle()); + assertFalse(p.getTitle().isEmpty()); + assertTrue(p.getTitle().size() == 1); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + + final Publication p_cleaned = cleanup(fixVocabularyNames(p)); + + assertNotNull(p_cleaned.getTitle()); + assertFalse(p_cleaned.getTitle().isEmpty()); + } + @Test void testOdfFromHdfs() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml")); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml new file mode 100644 index 0000000000..9ec696256f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml @@ -0,0 +1,70 @@ + + +
+ jairo_______::000012e58ed836576ef2a0d38b0f726f + oai:irdb.nii.ac.jp:01221:0000010198 + + + + + + 2021-05-10T11:31:09.424Z + 2021-06-03T01:45:42.536Z + jairo_______ +
+ + 多項式GCDを用いた復号法に関する研究 + 上原, 剛 + 甲斐, 博 + 野田, 松太郎 + application/pdf + http://hdl.handle.net/2433/25934 + jpn + 京都大学数理解析研究所 + 410 + Departmental Bulletin Paper + 0014 + 2004-10-01 + + openaire____::554c7c2873 + OPEN + + + 2433/25934 + AN00061013 + http://hdl.handle.net/2433/25934 + http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf + 数理解析研究所講究録 + + + + + https%3A%2F%2Firdb.nii.ac.jp%2Foai + oai:irdb.nii.ac.jp:01221:0000010198 + 2021-04-13T13:36:29Z + + + http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request + oai:repository.kulib.kyoto-u.ac.jp:2433/25934 + 2012-07-12T14:15:41Z + http://irdb.nii.ac.jp/oai + + + + + false + false + 0.9 + + + + +
\ No newline at end of file diff --git a/pom.xml b/pom.xml index 6e4526e41c..fc4a8a21b9 100644 --- a/pom.xml +++ b/pom.xml @@ -205,6 +205,11 @@ dateparser 1.0.7
+ + me.xuender + unidecode + 0.0.7 + com.google.guava From 4c5a71ba2f968e4a6eaf993eb2590306f92ad8d5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 28 Jul 2021 17:11:18 +0200 Subject: [PATCH 15/24] [broker] updated relation descriptors, making use of constant values --- .../EnrichMissingDatasetIsReferencedBy.java | 3 ++- .../EnrichMissingDatasetIsRelatedTo.java | 3 ++- .../EnrichMissingDatasetIsSupplementedBy.java | 3 ++- .../EnrichMissingDatasetIsSupplementedTo.java | 3 ++- .../EnrichMissingDatasetReferences.java | 3 ++- .../EnrichMissingPublicationIsReferencedBy.java | 3 ++- .../EnrichMissingPublicationIsRelatedTo.java | 3 ++- .../EnrichMissingPublicationIsSupplementedBy.java | 3 ++- .../EnrichMissingPublicationIsSupplementedTo.java | 3 ++- .../EnrichMissingPublicationReferences.java | 3 ++- .../eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java | 13 +++++++------ 11 files changed, 27 insertions(+), 16 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsReferencedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsReferencedBy.java index 21786687ee..bcbcf755f9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsReferencedBy.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsReferencedBy.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDat @Override protected boolean filterByType(final String relType) { - return relType.equals("isReferencedBy"); + return relType.equals(ModelConstants.IS_REFERENCED_BY); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsRelatedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsRelatedTo.java index 0f3739434e..4125974ce1 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsRelatedTo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsRelatedTo.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDatase @Override protected boolean filterByType(final String relType) { - return relType.equals("isRelatedTo"); + return relType.equals(ModelConstants.IS_RELATED_TO); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedBy.java index cde227feed..480daf6661 100644 --- 
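A note on the title-cleaning change in patch 14 above: the filter previously lower-cased the title, stripped everything matching TITLE_FILTER_REGEX, and kept the title only if the residue was long enough, so titles written entirely in non-Latin scripts could be discarded as empty, presumably because their characters matched the filter regex. Transliterating with Unidecode first, as the new unit test documents for CJK, Greek, Cyrillic, Armenian and Arabic strings, leaves a measurable residue. A hedged sketch of the decision rule; the regex and threshold are parameters here because their values are defined elsewhere in CleaningFunctions:

    import me.xuender.unidecode.Unidecode;

    public class TitleFilterSketch {

        // filterRegex / minResidual stand in for TITLE_FILTER_REGEX and
        // TITLE_FILTER_RESIDUAL_LENGTH, whose values are not shown in this patch.
        public static boolean keepTitle(final String title, final String filterRegex, final int minResidual) {
            final String residual = Unidecode
                .decode(title.toLowerCase())
                .replaceAll(filterRegex, "");
            return residual.length() > minResidual;
        }
    }

Per the new test, a title like "南无阿弥陀佛" is now measured via its transliteration "Nan Wu A Mi Tuo Fo" rather than via a possibly empty stripped string.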
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedBy.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedBy.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingD @Override protected boolean filterByType(final String relType) { - return relType.equals("isSupplementedBy"); + return relType.equals(ModelConstants.IS_SUPPLEMENTED_BY); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedTo.java index 750165ff5a..97b1eb8bdb 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedTo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedTo.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingD @Override protected boolean filterByType(final String relType) { - return relType.equals("isSupplementedTo"); + return relType.equals(ModelConstants.IS_SUPPLEMENT_TO); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetReferences.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetReferences.java index b1c0afe16f..0978486a37 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetReferences.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetReferences.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset @Override protected boolean filterByType(final String relType) { - return relType.equals("references"); + return relType.equals(ModelConstants.REFERENCES); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsReferencedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsReferencedBy.java index eebb5c1a66..ff9155c9d0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsReferencedBy.java 
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsReferencedBy.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissingPublication { @@ -11,6 +12,6 @@ public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissin @Override protected boolean filterByType(final String relType) { - return relType.equals("isReferencedBy"); + return relType.equals(ModelConstants.IS_REFERENCED_BY); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsRelatedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsRelatedTo.java index a8aa550d44..1051559c9e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsRelatedTo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsRelatedTo.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPublication { @@ -11,7 +12,7 @@ public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPu @Override protected boolean filterByType(final String relType) { - return relType.equals("isRelatedTo"); + return relType.equals(ModelConstants.IS_RELATED_TO); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedBy.java index 762ac942e4..d97f46f093 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedBy.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedBy.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMissingPublication { @@ -11,6 +12,6 @@ public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMiss @Override protected boolean filterByType(final String relType) { - return relType.equals("isSupplementedBy"); + return relType.equals(ModelConstants.IS_SUPPLEMENTED_BY); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedTo.java index fc7196a015..b33b340e33 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedTo.java +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedTo.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMissingPublication { @@ -11,7 +12,7 @@ public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMiss @Override protected boolean filterByType(final String relType) { - return relType.equals("isSupplementedTo"); + return relType.equals(ModelConstants.IS_SUPPLEMENT_TO); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationReferences.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationReferences.java index da19944549..fe0f96b6e5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationReferences.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationReferences.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPublication { @@ -11,7 +12,7 @@ public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPub @Override protected boolean filterByType(final String relType) { - return relType.equals("references"); + return relType.equals(ModelConstants.REFERENCES); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java index c7be633a9d..f578548fb6 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -5,6 +5,7 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -52,15 +53,15 @@ public class ClusterUtils { } public static boolean isDedupRoot(final String id) { - return id.contains("dedup_wf_"); + return id.contains("dedup"); } public static final boolean isValidResultResultClass(final String s) { - return s.equals("isReferencedBy") - || s.equals("isRelatedTo") - || s.equals("references") - || s.equals("isSupplementedBy") - || s.equals("isSupplementedTo"); + return s.equals(ModelConstants.IS_REFERENCED_BY) + || s.equals(ModelConstants.IS_RELATED_TO) + || s.equals(ModelConstants.REFERENCES) + || s.equals(ModelConstants.IS_SUPPLEMENTED_BY) + || s.equals(ModelConstants.IS_SUPPLEMENT_TO); } public static T incrementAccumulator(final T o, final LongAccumulator acc) { From 3d1580fa9b81fec1a066d74e91de122d519099da Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 28 Jul 2021 18:50:31 +0300 Subject: [PATCH 16/24] fixed a typo --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
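Patch 15 above swaps ten string literals for ModelConstants references and, in ClusterUtils, broadens isDedupRoot from id.contains("dedup_wf_") to id.contains("dedup"). The five-way equals() chain in isValidResultResultClass could equally be a single set lookup; a sketch with the same semantics (class name illustrative):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    import eu.dnetlib.dhp.schema.common.ModelConstants;

    public class RelClassFilterSketch {

        // One HashSet lookup instead of five equals() calls; behaviour is identical.
        private static final Set<String> VALID_RESULT_RESULT_CLASSES = new HashSet<>(
            Arrays.asList(
                ModelConstants.IS_REFERENCED_BY,
                ModelConstants.IS_RELATED_TO,
                ModelConstants.REFERENCES,
                ModelConstants.IS_SUPPLEMENTED_BY,
                ModelConstants.IS_SUPPLEMENT_TO));

        public static boolean isValidResultResultClass(final String s) {
            return VALID_RESULT_RESULT_CLASSES.contains(s);
        }
    }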
--git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 0c4a767a4e..378e0f17be 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -20,7 +20,7 @@ WHERE r.reltype = 'resultProject' create table ${stats_db_name}.project_classification as select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 -from ${openaire_db_name}project p +from ${openaire_db_name}.project p lateral view explode(p.h2020classification) classifs as class where p.datainfo.deletedbyinference=false and class.h2020programme is not null; From 4afa5215a9ad70c86bb4bd0db9ce9fc039e4ed2a Mon Sep 17 00:00:00 2001 From: antleb Date: Wed, 28 Jul 2021 21:59:12 +0300 Subject: [PATCH 17/24] fixed a NPE? --- .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 5d770dd617..76d31eb5e9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -55,7 +55,7 @@ CREATE TABLE ${stats_db_name}.dual INSERT INTO ${stats_db_name}.dual VALUES ('X'); INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, - `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`) + `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) SELECT 'other', 'Other', 'Repository', @@ -66,7 +66,9 @@ SELECT 'other', NULL, NULL, NULL, - 'unknown' + 'unknown', + null, + null FROM ${stats_db_name}.dual WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); DROP TABLE ${stats_db_name}.dual; From 3721df7aa6d54bf20c1c709a1dcd3be3d8dc3af4 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 29 Jul 2021 10:45:24 +0200 Subject: [PATCH 18/24] refactoring create actionset of scholexplorer, moved on package dhp-aggregation --- .../scholix}/SparkCreateActionset.scala | 22 +++++++++---------- .../scholix}/SparkSaveActionSet.scala | 14 ++++++------ .../dhp/sx/actionset/generate_actionset.json | 0 .../sx/actionset/oozie_app/config-default.xml | 0 .../dhp/sx/actionset/oozie_app/workflow.xml | 4 ++-- .../dhp/sx/actionset/save_actionset.json | 0 6 files changed, 20 insertions(+), 20 deletions(-) rename dhp-workflows/{dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision => dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix}/SparkCreateActionset.scala (63%) rename dhp-workflows/{dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision => dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix}/SparkSaveActionSet.scala (86%) rename dhp-workflows/{dhp-graph-provision => dhp-aggregation}/src/main/resources/eu/dnetlib/dhp/sx/actionset/generate_actionset.json (100%) rename dhp-workflows/{dhp-graph-provision => 
dhp-aggregation}/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml (100%) rename dhp-workflows/{dhp-graph-provision => dhp-aggregation}/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml (95%) rename dhp-workflows/{dhp-graph-provision => dhp-aggregation}/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json (100%) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala similarity index 63% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala index faf386d257..b78f411ee1 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala @@ -1,9 +1,9 @@ -package eu.dnetlib.dhp.sx.provision +package eu.dnetlib.dhp.actionmanager.scholix import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} -import org.apache.spark.{SparkConf, sql} -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.SparkConf +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} import scala.io.Source @@ -34,16 +34,16 @@ object SparkCreateActionset { val workingDirFolder = parser.get("workingDirFolder") log.info(s"workingDirFolder -> $workingDirFolder") - implicit val oafEncoders:Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val resultEncoders:Encoder[Result] = Encoders.kryo[Result] - implicit val relationEncoders:Encoder[Relation] = Encoders.kryo[Relation] + implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result] + implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation] - import spark.implicits._ + import spark.implicits._ val relation = spark.read.load(s"$sourcePath/relation").as[Relation] - relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) - .flatMap(r => List(r.getSource,r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation") + relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) + .flatMap(r => List(r.getSource, r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation") val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String] @@ -53,12 +53,12 @@ object SparkCreateActionset { log.info("save relation filtered") - relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) + relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) .write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf") log.info("saving entities") - val entities:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders)) + val 
entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders)) entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkSaveActionSet.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala similarity index 86% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkSaveActionSet.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala index d1d0b84242..1df7ea3fb1 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkSaveActionSet.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala @@ -1,9 +1,9 @@ -package eu.dnetlib.dhp.sx.provision +package eu.dnetlib.dhp.actionmanager.scholix import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.mapred.SequenceFileOutputFormat @@ -73,13 +73,13 @@ object SparkSaveActionSet { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - implicit val oafEncoders:Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING) + implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING) spark.read.load(sourcePath).as[Oaf] - .map(o =>toActionSet(o)) - .filter(o => o!= null) - .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) + .map(o => toActionSet(o)) + .filter(o => o != null) + .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec]) } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/generate_actionset.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/generate_actionset.json similarity index 100% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/generate_actionset.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/generate_actionset.json diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml diff --git 
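On the scholexplorer save step relocated in patch 18: SparkSaveActionSet serialises each OAF payload into an AtomicAction and writes (class name, JSON) pairs as a gzip-compressed Text/Text SequenceFile, the layout action sets use elsewhere in the codebase. A Java sketch of the sink, assuming the pairs have already been produced by a toActionSet-style mapper:

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.compress.GzipCodec;
    import org.apache.hadoop.mapred.SequenceFileOutputFormat;
    import org.apache.spark.api.java.JavaPairRDD;

    import scala.Tuple2;

    public class ActionSetSinkSketch {

        // pairs: (payload class name, JSON-serialised AtomicAction); the exact
        // payload layout is assumed here, not verified against the mapper.
        public static void save(final JavaPairRDD<String, String> pairs, final String targetPath) {
            pairs
                .mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2())))
                .saveAsHadoopFile(
                    targetPath,
                    Text.class,
                    Text.class,
                    SequenceFileOutputFormat.class,
                    GzipCodec.class);
        }
    }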
a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml similarity index 95% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml index 7c4b3dd269..8c045fcfe7 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml @@ -26,7 +26,7 @@ cluster Create Action Set eu.dnetlib.dhp.sx.provision.SparkCreateActionset - dhp-graph-provision-${projectVersion}.jar + dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -53,7 +53,7 @@ cluster Save Action Set eu.dnetlib.dhp.sx.provision.SparkSaveActionSet - dhp-graph-provision-${projectVersion}.jar + dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json similarity index 100% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json From 908f57a4758b2aa2ff1c93cad41fcf2328c73bbe Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 29 Jul 2021 10:49:39 +0200 Subject: [PATCH 19/24] code formatting --- .../main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java index f578548fb6..7c4ca1d22f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -5,7 +5,6 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -18,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class ClusterUtils { From e87e1805c4280e9d8ed9be9f733d331401273118 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 29 Jul 2021 12:13:06 +0200 Subject: [PATCH 20/24] [raw_all] added extra workflow step for patching the identifiers in the relations, given an id mapping dataset --- .../graph/raw/PatchRelationsApplication.java | 115 ++++++++++++++++++ .../graph/raw/common/RelationIdMapping.java | 24 ++++ .../oa/graph/patch_relations_parameters.json | 26 ++++ .../oa/graph/raw_all/oozie_app/workflow.xml | 47 ++++++- 4 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 
dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/RelationIdMapping.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java new file mode 100644 index 0000000000..c2bcf69f09 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java @@ -0,0 +1,115 @@ +package eu.dnetlib.dhp.oa.graph.raw; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.oa.graph.raw.common.RelationIdMapping; +import eu.dnetlib.dhp.schema.oaf.Relation; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.io.FileNotFoundException; +import java.util.Objects; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +public class PatchRelationsApplication { + + private static final Logger log = LoggerFactory.getLogger(PatchRelationsApplication.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + Optional.ofNullable( + PatchRelationsApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json")) + .orElseThrow(FileNotFoundException::new) + )); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphBasePath = parser.get("graphBasePath"); + log.info("graphBasePath: {}", graphBasePath); + + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); + + final String idMappingPath = parser.get("idMappingPath"); + log.info("idMappingPath: {}", idMappingPath); + + final SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> patchRelations(spark, graphBasePath, workingDir, idMappingPath)); + } + + /** + * Substitutes the identifiers (source/target) from the set of relations part of the graphBasePath included in the + * mapping provided by the dataset stored on idMappingPath, using workingDir as intermediate storage location. 
+ * + * @param spark the SparkSession + * @param graphBasePath base graph path providing the set of relations to patch + * @param workingDir intermediate storage location + * @param idMappingPath dataset providing the old -> new identifier mapping + */ + private static void patchRelations(final SparkSession spark, final String graphBasePath, final String workingDir, final String idMappingPath) { + + final String relationPath = graphBasePath + "/relation"; + + final Dataset rels = Utils.readPath(spark, relationPath, Relation.class); + final Dataset idMapping = Utils.readPath(spark, idMappingPath, RelationIdMapping.class); + + rels + .joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left") + .map((MapFunction, Relation>) t -> { + final Relation r = t._1(); + Optional.ofNullable(t._2()) + .map(RelationIdMapping::getNewId) + .ifPresent(r::setSource); + return r; + }, Encoders.bean(Relation.class)) + .joinWith(idMapping, rels.col("target").equalTo(idMapping.col("oldId")), "left") + .map((MapFunction, Relation>) t -> { + final Relation r = t._1(); + Optional.ofNullable(t._2()) + .map(RelationIdMapping::getNewId) + .ifPresent(r::setTarget); + return r; + }, Encoders.bean(Relation.class)) + .map( + (MapFunction) OBJECT_MAPPER::writeValueAsString, + Encoders.STRING()) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(workingDir); + + spark.read().textFile(workingDir) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(relationPath); + } + + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/RelationIdMapping.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/RelationIdMapping.java new file mode 100644 index 0000000000..f251da8c33 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/RelationIdMapping.java @@ -0,0 +1,24 @@ +package eu.dnetlib.dhp.oa.graph.raw.common; + +public class RelationIdMapping { + + private String oldId; + + private String newId; + + public String getOldId() { + return oldId; + } + + public void setOldId(final String oldId) { + this.oldId = oldId; + } + + public String getNewId() { + return newId; + } + + public void setNewId(final String newId) { + this.newId = newId; + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json new file mode 100644 index 0000000000..178c2d69bf --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "g", + "paramLongName": "graphBasePath", + "paramDescription": "base graph path providing the set of relations to patch", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingDir", + "paramDescription": "intermediate storage location", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "idMappingPath", + "paramDescription": "dataset providing the old -> new identifier mapping", + "paramRequired": true + } +] \ No newline at end of file diff --git 
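Patch 20's id rewrite is two consecutive left joins against the (oldId, newId) mapping dataset — first on relation sources, then on targets — leaving an endpoint untouched whenever no mapping row matches; the intermediate write under workingDir is needed because Spark cannot overwrite a path it is still lazily reading from. A column-level sketch of the same rewrite (column names oldId/newId/source/target as in the patch, everything else illustrative):

    import static org.apache.spark.sql.functions.coalesce;
    import static org.apache.spark.sql.functions.col;

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    public class PatchRelationIdsSketch {

        // rels(source, target, ...) and mapping(oldId, newId): an endpoint is
        // rewritten when a mapping row matches, and kept as-is otherwise.
        public static Dataset<Row> patch(final Dataset<Row> rels, final Dataset<Row> mapping) {
            Dataset<Row> patched = rels
                .join(mapping, rels.col("source").equalTo(mapping.col("oldId")), "left")
                .withColumn("source", coalesce(col("newId"), col("source")))
                .drop("oldId", "newId");
            patched = patched
                .join(mapping, patched.col("target").equalTo(mapping.col("oldId")), "left")
                .withColumn("target", coalesce(col("newId"), col("target")))
                .drop("oldId", "newId");
            return patched;
        }
    }

Here coalesce(newId, source) plays the role of the Optional.ofNullable(...).ifPresent(...) substitution in the bean-based application code.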
a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index 7f1ecb39fb..e7320de3b3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -100,6 +100,16 @@ a blacklist of nsprefixes (comma separeted) + + shouldPatchRelations + false + activates the relation patching phase, driven by the content in ${idMappingPath} + + + idMappingPath + + path pointing to the relations identifiers mapping dataset + sparkDriverMemory memory for driver process @@ -538,7 +548,42 @@ - + + + + + + ${(shouldPatchRelations eq "true") and + (fs:exists(concat(concat(wf:conf('nameNode'),'/'),wf:conf('idMappingPath'))) eq "true")} + + + + + + + + yarn + cluster + PatchRelations + eu.dnetlib.dhp.oa.graph.raw.PatchRelationsApplication + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --graphBasePath${workingDir}/graph_raw + --workingDir${workingDir}/patch_relations + --idMappingPath${idMappingPath} + + + + From 5d08ad86ae45478db3742fef51c7c0ae38f30e34 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 29 Jul 2021 13:03:16 +0200 Subject: [PATCH 21/24] [raw_all] patching relation identifier phase to be run at the end, i.e. 
includes also claimed relations --- .../oa/graph/raw_all/oozie_app/workflow.xml | 75 +++++++++---------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index e7320de3b3..321ca40909 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -548,42 +548,7 @@ - - - - - - ${(shouldPatchRelations eq "true") and - (fs:exists(concat(concat(wf:conf('nameNode'),'/'),wf:conf('idMappingPath'))) eq "true")} - - - - - - - - yarn - cluster - PatchRelations - eu.dnetlib.dhp.oa.graph.raw.PatchRelationsApplication - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --graphBasePath${workingDir}/graph_raw - --workingDir${workingDir}/patch_relations - --idMappingPath${idMappingPath} - - - - + @@ -596,7 +561,6 @@ - yarn @@ -805,7 +769,42 @@ - + + + + + + ${(shouldPatchRelations eq "true") and + (fs:exists(concat(concat(wf:conf('nameNode'),'/'),wf:conf('idMappingPath'))) eq "true")} + + + + + + + + yarn + cluster + PatchRelations + eu.dnetlib.dhp.oa.graph.raw.PatchRelationsApplication + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --graphBasePath${graphOutputPath} + --workingDir${workingDir}/patch_relations + --idMappingPath${idMappingPath} + + + + \ No newline at end of file From b1b0cc3f157df3998d2fe1392e6b5b7e76f75c12 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 29 Jul 2021 13:54:56 +0200 Subject: [PATCH 22/24] fixed wrong package name --- .../eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml index 8c045fcfe7..2d97b51633 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml @@ -25,7 +25,7 @@ yarn-cluster cluster Create Action Set - eu.dnetlib.dhp.sx.provision.SparkCreateActionset + eu.dnetlib.dhp.actionmanager.scholix.SparkCreateActionset dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -52,7 +52,7 @@ yarn-cluster cluster Save Action Set - eu.dnetlib.dhp.sx.provision.SparkSaveActionSet + 
eu.dnetlib.dhp.actionmanager.scholix.SparkSaveActionSet dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} From c53d106e80eca2af6456925130dd856bf5d33c49 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 29 Jul 2021 13:56:37 +0200 Subject: [PATCH 23/24] [provision] lowercase relation filter --- .../java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 7d53d35549..b3f7854924 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -10,6 +10,7 @@ import java.util.Set; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -81,6 +82,7 @@ public class PrepareRelationsJob { Set relationFilter = Optional .ofNullable(parser.get("relationFilter")) + .map(String::toLowerCase) .map(s -> Sets.newHashSet(Splitter.on(",").split(s))) .orElse(new HashSet<>()); log.info("relationFilter: {}", relationFilter); @@ -130,7 +132,7 @@ public class PrepareRelationsJob { JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath) .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) - .filter(rel -> relationFilter.contains(rel.getRelClass()) == false); + .filter(rel -> relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())) == false); JavaRDD pruned = pruneRels( pruneRels( From 6358f92c3a5301bd25ec00259453490d19e318aa Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 30 Jul 2021 08:54:25 +0200 Subject: [PATCH 24/24] added sleep to solve problem of lost request of creating index --- .../java/eu/dnetlib/dhp/sx/provision/DropAndCreateESIndex.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/DropAndCreateESIndex.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/DropAndCreateESIndex.java index f96a64a27b..ffeb0995d1 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/DropAndCreateESIndex.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/DropAndCreateESIndex.java @@ -71,6 +71,9 @@ public class DropAndCreateESIndex { log.info(STATUS_CODE_TEXT, response.getStatusLine()); } + log.info("Sleeping 60 seconds to avoid to lost the creation of index request"); + Thread.sleep(60000); + try (CloseableHttpClient client = HttpClients.createDefault()) { final String summaryConf = IOUtils