From 2032b0df40c05693f108b6878e0a97cd8aacbb01 Mon Sep 17 00:00:00 2001
From: dimitrispie
Date: Wed, 14 Jun 2023 19:09:09 +0300
Subject: [PATCH 1/2] Bug fixes

1. Remove tables/views from old databases in the new cluster, before dropping the dbs
2. Fix id in result_accessroute, indi_impact_measures, indi_pub_bronze_oa
---
Review notes (this area is ignored by git am):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this series; verify context lines against the tree before applying.
- step15.sql: indi_impact_measures is now created and analyzed as
  ${stats_db_name}.indi_impact_measures, matching every other statement in
  that script. The post-image blob id on the step15.sql "index" line predates
  this fix-up; regenerate the series with git format-patch after applying.

 .../oozie_app/copyDataToImpalaCluster.sh      | 18 +++-
 .../graph/stats/oozie_app/scripts/step14.sql  |  2 +-
 .../graph/stats/oozie_app/scripts/step15.sql  | 10 +-
 .../scripts/step16-createIndicatorsTables.sql | 91 ++++++++++---------
 4 files changed, 72 insertions(+), 49 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
index b937eea25..392a0b6ba 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
@@ -7,7 +7,7 @@ then
 fi
 
 #export HADOOP_USER_NAME="dimitris.pierrakos"
-export HADOOP_USER_NAME=$5
+export HADOOP_USER_NAME=$6
 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
 function copydb() {
     db=$1
@@ -21,8 +21,22 @@ function copydb() {
     # change ownership to impala
     hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
 
-    # create the databases
+    # drop tables from db
+    for i in `impala-shell --user $HADOOP_USER_NAME-i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
+    do
+        `impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${db} -q "drop table $i;"`;
+    done
+
+    # drop views from db
+    for i in `impala-shell --user $HADOOP_USER_NAME-i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
+    do
+        `impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${db} -q "drop view $i;"`;
+    done
+
+    # delete the database
     impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
+
+    # create the databases
     impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
 
     impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
index dc9e6c1f9..39755d68e 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
@@ -49,5 +49,5 @@ FROM (
          WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
 
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
-select distinct substr(id,4),id, accessroute from ${openaire_db_name}.result
+select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
 lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute;
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
index cec22cd3e..7eba908fd 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
@@ -33,4 +33,12 @@ select * from ${stats_db_name}.dataset_refereed
 union all
 select * from ${stats_db_name}.software_refereed
 union all
-select * from ${stats_db_name}.otherresearchproduct_refereed;
\ No newline at end of file
+select * from ${stats_db_name}.otherresearchproduct_refereed;
+
+create table if not exists ${stats_db_name}.indi_impact_measures as
+select distinct substr(id, 4) as id, measures_ids.id impactmetric, measures_ids.unit.value[0] score,
+cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] class
+from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
+where measures_ids.id!='views' and measures_ids.id!='downloads';
+
+ANALYZE TABLE ${stats_db_name}.indi_impact_measures COMPUTE STATISTICS;
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index 21d4efbf7..2a4742fac 100755
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -342,40 +342,6 @@ FROM publication_datasources pd
 
 ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
 
- create table if not exists indi_pub_bronze_oa stored as parquet as
- WITH hybrid_oa AS (
-    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
-    FROM STATS_EXT.plan_s_jn
-    WHERE issn_print != ""
-    UNION ALL
-    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
-    FROM STATS_EXT.plan_s_jn
-    WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
- issn AS (
-    SELECT *
-    FROM (
-        SELECT id, issn_printed as issn
-        FROM datasource
-        WHERE issn_printed IS NOT NULL
-        UNION ALL
-        SELECT id,issn_online as issn
-        FROM datasource
-        WHERE issn_online IS NOT NULL ) as issn
-    WHERE LENGTH(issn) > 7)
-SELECT DISTINCT pd.id, coalesce(is_bronze_oa, 0) as is_hybrid_oa
-FROM publication_datasources pd
-    LEFT OUTER JOIN (
-        SELECT pd.id, 1 as is_bronze_oa from publication_datasources pd
-        JOIN datasource d on d.id=pd.datasource
-        JOIN issn on issn.id=pd.datasource
-        JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
-        JOIN indi_result_has_cc_licence cc on pd.id=cc.id
-        JOIN indi_pub_gold_oa ga on pd.id=ga.id
-        JOIN indi_pub_hybrid_oa_with_cc hy on hy.id=pd.id
-        where cc.has_cc_license=0 and ga.is_gold=0 and hy.is_hybrid_oa=0) tmp on pd.id=tmp.id;
-
-ANALYZE TABLE indi_pub_bronze_oa COMPUTE STATISTICS;
-
 create table if not exists indi_pub_hybrid stored as parquet as
 WITH gold_oa AS (
     SELECT issn_l,
@@ -775,26 +741,61 @@ from result p
 
 ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS;
 
-create table if not exists indi_impact_measures as
-select distinct substr(id, 4), measures_ids.id impactmetric, measures_ids.unit.value[0] score,
-cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] class
-from result lateral view explode(measures) measures as measures_ids
-where measures_ids.id!='views' and measures_ids.id!='downloads';
-
-ANALYZE TABLE indi_impact_measures COMPUTE STATISTICS;
-
 CREATE TEMPORARY TABLE pub_fos_totals as
 select rf.id, count(distinct lvl3) totals from result_fos rf
 group by rf.id;
 
 create table if not exists indi_pub_interdisciplinarity as
-select distinct p.id, coalesce(indi_pub_is_interdisciplinary, 0)
+select distinct p.id as id, coalesce(indi_pub_is_interdisciplinary, 0)
 as indi_pub_is_interdisciplinary
 from pub_fos_totals p
 left outer join (
 select pub_fos_totals.id, 1 as indi_pub_is_interdisciplinary from pub_fos_totals
-where totals>10) tmp on p.id=tmp.id;
+where totals>1) tmp on p.id=tmp.id;
 
 drop table pub_fos_totals purge;
 
-ANALYZE TABLE indi_pub_interdisciplinarity COMPUTE STATISTICS;
\ No newline at end of file
+ANALYZE TABLE indi_pub_interdisciplinarity COMPUTE STATISTICS;
+
+create table if not exists indi_pub_bronze_oa stored as parquet as
+select distinct p.id, coalesce(is_bronze_oa,0) as is_bronze_oa
+from publication p
+left outer join
+(select p.id, 1 as is_bronze_oa from publication p
+join indi_result_has_cc_licence cc on cc.id=p.id
+join indi_pub_gold_oa ga on ga.id=p.id
+where cc.has_cc_license=0 and ga.is_gold=0) tmp on tmp.id=p.id;
+
+-- create table if not exists indi_pub_bronze_oa stored as parquet as
+-- WITH hybrid_oa AS (
+--    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
+--    FROM STATS_EXT.plan_s_jn
+--    WHERE issn_print != ""
+--    UNION ALL
+--    SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
+--    FROM STATS_EXT.plan_s_jn
+--    WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
+-- issn AS (
+--    SELECT *
+--    FROM (
+--        SELECT id, issn_printed as issn
+--        FROM datasource
+--        WHERE issn_printed IS NOT NULL
+--        UNION ALL
+--        SELECT id,issn_online as issn
+--        FROM datasource
+--        WHERE issn_online IS NOT NULL ) as issn
+--    WHERE LENGTH(issn) > 7)
+--SELECT DISTINCT pd.id, coalesce(is_bronze_oa, 0) as is_bronze_oa
+--FROM publication_datasources pd
+--    LEFT OUTER JOIN (
+--        SELECT pd.id, 1 as is_bronze_oa from publication_datasources pd
+--        JOIN datasource d on d.id=pd.datasource
+--        JOIN issn on issn.id=pd.datasource
+--        JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
+--        JOIN indi_result_has_cc_licence cc on pd.id=cc.id
+--        JOIN indi_pub_gold_oa ga on pd.id=ga.id
+--        JOIN indi_pub_hybrid_oa_with_cc hy on hy.id=pd.id
+--        where cc.has_cc_license=0 and ga.is_gold=0 and hy.is_hybrid_oa=0) tmp on pd.id=tmp.id;
+
+ANALYZE TABLE indi_pub_bronze_oa COMPUTE STATISTICS;
\ No newline at end of file
From 42b8ce2ba452d04460a90c5799a2f10f60ebcd4e Mon Sep 17 00:00:00 2001
From: dimitrispie
Date: Wed, 14 Jun 2023 19:23:42 +0300
Subject: [PATCH 2/2] Update copyDataToImpalaCluster.sh

Fix the table/view clean-up loops in copydb():
- add the missing space between $HADOOP_USER_NAME and -i;
- run the drop statements directly instead of wrapping them in backticks,
  which made the shell execute impala-shell's output as a command;
- remove the duplicated -d, which made impala-shell take "-d" as the
  database name;
- pass --user to the drop statements like every other impala-shell call.
---
Review notes (this area is ignored by git am):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this series; verify context lines against the tree before applying.
- The post-image blob id on the "index" line predates the drop-statement
  fix-ups; regenerate the series with git format-patch after applying.

 .../dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
index 392a0b6ba..87294f6e9 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
@@ -22,13 +22,13 @@ function copydb() {
     hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
 
     # drop tables from db
-    for i in `impala-shell --user $HADOOP_USER_NAME-i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
+    for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
     do
-        `impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${db} -q "drop table $i;"`;
+        impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;";
     done
 
     # drop views from db
-    for i in `impala-shell --user $HADOOP_USER_NAME-i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
+    for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
     do
-        `impala-shell -i impala-cluster-dn1.openaire.eu -d -d ${db} -q "drop view $i;"`;
+        impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;";
     done