Merge pull request '[stats wf] Bug fixes' (#308) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: #308
This commit is contained in:
commit
b9748763e2
|
@ -7,7 +7,7 @@ then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
#export HADOOP_USER_NAME="dimitris.pierrakos"
|
#export HADOOP_USER_NAME="dimitris.pierrakos"
|
||||||
export HADOOP_USER_NAME=$5
|
export HADOOP_USER_NAME=$6
|
||||||
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
|
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
|
@ -21,8 +21,22 @@ function copydb() {
|
||||||
# change ownership to impala
|
# change ownership to impala
|
||||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
||||||
|
|
||||||
# create the databases
|
# drop tables from db
|
||||||
|
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
||||||
|
do
|
||||||
|
# Drop the table as the job user, in ${db}. The command is run directly rather
# than inside backticks, so impala-shell's output is not re-executed as a
# shell command, and -d is passed exactly once so ${db} is the database name.
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"
|
||||||
|
done
|
||||||
|
|
||||||
|
# drop views from db
|
||||||
|
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
||||||
|
do
|
||||||
|
# Drop the view as the job user, in ${db}. The command is run directly rather
# than inside backticks, so impala-shell's output is not re-executed as a
# shell command, and -d is passed exactly once so ${db} is the database name.
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"
|
||||||
|
done
|
||||||
|
|
||||||
|
# delete the database
|
||||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
|
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
|
||||||
|
|
||||||
|
# create the database
|
||||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
|
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
|
||||||
|
|
||||||
impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
|
impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
|
||||||
|
|
|
@ -49,5 +49,5 @@ FROM (
|
||||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
|
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
|
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
|
||||||
select distinct substr(id,4),id, accessroute from ${openaire_db_name}.result
|
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
|
||||||
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute;
|
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute;
|
||||||
|
|
|
@ -34,3 +34,11 @@ union all
|
||||||
select * from ${stats_db_name}.software_refereed
|
select * from ${stats_db_name}.software_refereed
|
||||||
union all
|
union all
|
||||||
select * from ${stats_db_name}.otherresearchproduct_refereed;
|
select * from ${stats_db_name}.otherresearchproduct_refereed;
|
||||||
|
|
||||||
|
create table if not exists indi_impact_measures as
|
||||||
|
select distinct substr(id, 4) as id, measures_ids.id impactmetric, measures_ids.unit.value[0] score,
|
||||||
|
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] class
|
||||||
|
from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
|
||||||
|
where measures_ids.id!='views' and measures_ids.id!='downloads';
|
||||||
|
|
||||||
|
ANALYZE TABLE indi_impact_measures COMPUTE STATISTICS;
|
||||||
|
|
|
@ -342,40 +342,6 @@ FROM publication_datasources pd
|
||||||
|
|
||||||
ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
|
ANALYZE TABLE indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
|
||||||
|
|
||||||
create table if not exists indi_pub_bronze_oa stored as parquet as
|
|
||||||
WITH hybrid_oa AS (
|
|
||||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
|
|
||||||
FROM STATS_EXT.plan_s_jn
|
|
||||||
WHERE issn_print != ""
|
|
||||||
UNION ALL
|
|
||||||
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
|
|
||||||
FROM STATS_EXT.plan_s_jn
|
|
||||||
WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
|
|
||||||
issn AS (
|
|
||||||
SELECT *
|
|
||||||
FROM (
|
|
||||||
SELECT id, issn_printed as issn
|
|
||||||
FROM datasource
|
|
||||||
WHERE issn_printed IS NOT NULL
|
|
||||||
UNION ALL
|
|
||||||
SELECT id,issn_online as issn
|
|
||||||
FROM datasource
|
|
||||||
WHERE issn_online IS NOT NULL ) as issn
|
|
||||||
WHERE LENGTH(issn) > 7)
|
|
||||||
SELECT DISTINCT pd.id, coalesce(is_bronze_oa, 0) as is_hybrid_oa
|
|
||||||
FROM publication_datasources pd
|
|
||||||
LEFT OUTER JOIN (
|
|
||||||
SELECT pd.id, 1 as is_bronze_oa from publication_datasources pd
|
|
||||||
JOIN datasource d on d.id=pd.datasource
|
|
||||||
JOIN issn on issn.id=pd.datasource
|
|
||||||
JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
|
|
||||||
JOIN indi_result_has_cc_licence cc on pd.id=cc.id
|
|
||||||
JOIN indi_pub_gold_oa ga on pd.id=ga.id
|
|
||||||
JOIN indi_pub_hybrid_oa_with_cc hy on hy.id=pd.id
|
|
||||||
where cc.has_cc_license=0 and ga.is_gold=0 and hy.is_hybrid_oa=0) tmp on pd.id=tmp.id;
|
|
||||||
|
|
||||||
ANALYZE TABLE indi_pub_bronze_oa COMPUTE STATISTICS;
|
|
||||||
|
|
||||||
create table if not exists indi_pub_hybrid stored as parquet as
|
create table if not exists indi_pub_hybrid stored as parquet as
|
||||||
WITH gold_oa AS ( SELECT
|
WITH gold_oa AS ( SELECT
|
||||||
issn_l,
|
issn_l,
|
||||||
|
@ -775,26 +741,61 @@ from result p
|
||||||
|
|
||||||
ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS;
|
ANALYZE TABLE indi_result_with_pid COMPUTE STATISTICS;
|
||||||
|
|
||||||
create table if not exists indi_impact_measures as
|
|
||||||
select distinct substr(id, 4), measures_ids.id impactmetric, measures_ids.unit.value[0] score,
|
|
||||||
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] class
|
|
||||||
from result lateral view explode(measures) measures as measures_ids
|
|
||||||
where measures_ids.id!='views' and measures_ids.id!='downloads';
|
|
||||||
|
|
||||||
ANALYZE TABLE indi_impact_measures COMPUTE STATISTICS;
|
|
||||||
|
|
||||||
CREATE TEMPORARY TABLE pub_fos_totals as
|
CREATE TEMPORARY TABLE pub_fos_totals as
|
||||||
select rf.id, count(distinct lvl3) totals from result_fos rf
|
select rf.id, count(distinct lvl3) totals from result_fos rf
|
||||||
group by rf.id;
|
group by rf.id;
|
||||||
|
|
||||||
create table if not exists indi_pub_interdisciplinarity as
|
create table if not exists indi_pub_interdisciplinarity as
|
||||||
select distinct p.id, coalesce(indi_pub_is_interdisciplinary, 0)
|
select distinct p.id as id, coalesce(indi_pub_is_interdisciplinary, 0)
|
||||||
as indi_pub_is_interdisciplinary
|
as indi_pub_is_interdisciplinary
|
||||||
from pub_fos_totals p
|
from pub_fos_totals p
|
||||||
left outer join (
|
left outer join (
|
||||||
select pub_fos_totals.id, 1 as indi_pub_is_interdisciplinary from pub_fos_totals
|
select pub_fos_totals.id, 1 as indi_pub_is_interdisciplinary from pub_fos_totals
|
||||||
where totals>10) tmp on p.id=tmp.id;
|
where totals>1) tmp on p.id=tmp.id;
|
||||||
|
|
||||||
drop table pub_fos_totals purge;
|
drop table pub_fos_totals purge;
|
||||||
|
|
||||||
ANALYZE TABLE indi_pub_interdisciplinarity COMPUTE STATISTICS;
|
ANALYZE TABLE indi_pub_interdisciplinarity COMPUTE STATISTICS;
|
||||||
|
|
||||||
|
create table if not exists indi_pub_bronze_oa stored as parquet as
|
||||||
|
select distinct p.id, coalesce(is_bronze_oa,0) as is_bronze_oa
|
||||||
|
from publication p
|
||||||
|
left outer join
|
||||||
|
(select p.id, 1 as is_bronze_oa from publication p
|
||||||
|
join indi_result_has_cc_licence cc on cc.id=p.id
|
||||||
|
join indi_pub_gold_oa ga on ga.id=p.id
|
||||||
|
where cc.has_cc_license=0 and ga.is_gold=0) tmp on tmp.id=p.id;
|
||||||
|
|
||||||
|
-- create table if not exists indi_pub_bronze_oa stored as parquet as
|
||||||
|
-- WITH hybrid_oa AS (
|
||||||
|
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
|
||||||
|
-- FROM STATS_EXT.plan_s_jn
|
||||||
|
-- WHERE issn_print != ""
|
||||||
|
-- UNION ALL
|
||||||
|
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_online as issn
|
||||||
|
-- FROM STATS_EXT.plan_s_jn
|
||||||
|
-- WHERE issn_online != "" and (journal_is_in_doaj = FALSE OR journal_is_oa = FALSE)),
|
||||||
|
-- issn AS (
|
||||||
|
-- SELECT *
|
||||||
|
-- FROM (
|
||||||
|
-- SELECT id, issn_printed as issn
|
||||||
|
-- FROM datasource
|
||||||
|
-- WHERE issn_printed IS NOT NULL
|
||||||
|
-- UNION ALL
|
||||||
|
-- SELECT id,issn_online as issn
|
||||||
|
-- FROM datasource
|
||||||
|
-- WHERE issn_online IS NOT NULL ) as issn
|
||||||
|
-- WHERE LENGTH(issn) > 7)
|
||||||
|
--SELECT DISTINCT pd.id, coalesce(is_bronze_oa, 0) as is_bronze_oa
|
||||||
|
--FROM publication_datasources pd
|
||||||
|
-- LEFT OUTER JOIN (
|
||||||
|
-- SELECT pd.id, 1 as is_bronze_oa from publication_datasources pd
|
||||||
|
-- JOIN datasource d on d.id=pd.datasource
|
||||||
|
-- JOIN issn on issn.id=pd.datasource
|
||||||
|
-- JOIN hybrid_oa ON issn.issn = hybrid_oa.issn
|
||||||
|
-- JOIN indi_result_has_cc_licence cc on pd.id=cc.id
|
||||||
|
-- JOIN indi_pub_gold_oa ga on pd.id=ga.id
|
||||||
|
-- JOIN indi_pub_hybrid_oa_with_cc hy on hy.id=pd.id
|
||||||
|
-- where cc.has_cc_license=0 and ga.is_gold=0 and hy.is_hybrid_oa=0) tmp on pd.id=tmp.id;
|
||||||
|
|
||||||
|
ANALYZE TABLE indi_pub_bronze_oa COMPUTE STATISTICS;
|
Loading…
Reference in New Issue