diff --git a/dhp-workflows/dhp-stats-monitor-irish/pom.xml b/dhp-workflows/dhp-stats-monitor-irish/pom.xml new file mode 100644 index 000000000..9d850a4bb --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-irish/pom.xml @@ -0,0 +1,32 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.2.4-SNAPSHOT + + 4.0.0 + dhp-stats-monitor-irish + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + + + pl.project13.maven + git-commit-id-plugin + 2.1.11 + + false + + + + + diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/config-default.xml b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/config-default.xml new file mode 100644 index 000000000..b2a1322e6 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh new file mode 100644 index 000000000..78c44ed20 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -0,0 +1,70 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export HADOOP_USER_NAME=$2 + +function copydb() { + + db=$1 + FILE=("hive_wf_tmp_"$RANDOM) + hdfs dfs -mkdir hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/ + + # change ownership to impala +# hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/ + + + # copy the databases from ocean to impala + echo "copying $db" + hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/ + + hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db + + # drop tables from db + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`; + done + + # drop views from db + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`; + done + + # delete the database + impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade"; + + # create the databases + impala-shell -i impala-cluster-dn1.openaire.eu -q "create database ${db}"; + + impala-shell -q "INVALIDATE METADATA" + echo "creating schema for ${db}" + for (( k = 0; k < 5; k ++ )); do + for i in `impala-shell -d ${db} --delimited -q "show tables"`; + do + impala-shell -d ${db} --delimited -q "show create table $i"; + done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f - + done + + # load the data from /tmp in the respective tables + echo "copying data in tables and computing stats" + for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`; + do + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i"; + impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i"; + done + + # deleting the remaining directory from hdfs +hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db +} + +MONITOR_DB=$1 +#HADOOP_USER_NAME=$2 +copydb $MONITOR_DB + diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/finalizeImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/finalizeImpalaCluster.sh new file mode 100644 index 000000000..38a2f61bc --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/finalizeImpalaCluster.sh @@ -0,0 +1,23 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +SOURCE=$1 +PRODUCTION=$2 +echo ${SOURCE} +echo ${PRODUCTION} + +#echo "Updating ${PRODUCTION} monitor database old cluster" +#impala-shell -q "create database if not exists ${PRODUCTION}" +#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - +#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - + +echo "Updating ${PRODUCTION} monitor database" +impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}" +impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - +impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f - +echo "Production monitor db ready!" diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/monitor_irish.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/monitor_irish.sh new file mode 100644 index 000000000..27e399e98 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/monitor_irish.sh @@ -0,0 +1,28 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export TARGET=$2 +export SHADOW=$3 +export SCRIPT_PATH=$4 +export GRAPHDB=$5 + + +export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228" +export HADOOP_USER_NAME="oozie" + +echo "Getting file from " $4 +hdfs dfs -copyToLocal $4 + +#update Monitor DB IRISH +#cat CreateDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$3/g1" > foo +cat buildIrishMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$5/g1" > foo +hive $HIVE_OPTS -f foo + +echo "Hive shell finished" + diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/scripts/buildIrishMonitorDB.sql b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/scripts/buildIrishMonitorDB.sql new file mode 100644 index 000000000..db76fdab3 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/scripts/buildIrishMonitorDB.sql @@ -0,0 +1,224 @@ +drop database if exists TARGET cascade; +create database if not exists TARGET; + +create view if not exists TARGET.category as select * from SOURCE.category; +create view if not exists TARGET.concept as select * from SOURCE.concept; +create view if not exists TARGET.context as select * from SOURCE.context; +create view if not exists TARGET.country as select * from SOURCE.country; +create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; +create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; +--create view if not exists TARGET.funder as select * from SOURCE.funder; +create view if not exists TARGET.fundref as select * from SOURCE.fundref; +create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; + +drop table if exists TARGET.irish_funders; + +create TEMPORARY table TARGET.irish_funders as +select distinct xpath_string(fundingtree[0].value, '//funder/name') as funder from GRAPHDB.project + where xpath_string(fundingtree[0].value, '//funder/jurisdiction')='IE'; +--create TEMPORARY table TARGET.irish_funders as +--select distinct name as funder from SOURCE.fundref where country='IE'; + +drop table if exists TARGET.result; + +create table TARGET.result stored as parquet as + select distinct * from SOURCE.result r where r.id in (select ro.id from SOURCE.result_organization ro + left outer join SOURCE.organization o on o.id=ro.organization + left outer join SOURCE.result_projects rp on rp.id=ro.id + left outer join SOURCE.project p on p.id=rp.project + join TARGET.irish_funders irf + where o.country='IE' or p.funder=irf.funder); + +create view if not exists TARGET.category as select * from SOURCE.category; +create view if not exists TARGET.concept as select * from SOURCE.concept; +create view if not exists TARGET.context as select * from SOURCE.context; +create view if not exists TARGET.country as select * from SOURCE.country; +create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp; +create view if not exists TARGET.creation_date as select * from SOURCE.creation_date; + +create table TARGET.funder stored as parquet as select * from SOURCE.funder where country='IE'; + +create view if not exists TARGET.fundref as select * from SOURCE.fundref; +create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture; +create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure; +create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents; +create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers; +create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft; +create view if not exists TARGET.hrrst as select * from SOURCE.hrrst; +--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates; + +create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized; + +create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); +create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); +create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; +drop view TARGET.foo1; +drop view TARGET.foo2; + +-- datasources +create view if not exists TARGET.datasource as select * from SOURCE.datasource; +create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids; +create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations; +create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources; + +create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources; + +-- organizations +create view if not exists TARGET.organization as select * from SOURCE.organization; +create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources; +create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids; +create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects; +create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources; + +-- projects +create view if not exists TARGET.project as select * from SOURCE.project; +create view if not exists TARGET.project_oids as select * from SOURCE.project_oids; +create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations; +create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount; +create view if not exists TARGET.project_classification as select * from SOURCE.project_classification; +create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution; + +create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects; + + +-- indicators +-- Sprint 1 ---- +create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +-- Sprint 2 ---- +create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +---- Sprint 3 ---- +create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab; +create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab; +create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org; +create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country; +create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab; +create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab; +---- Sprint 4 ---- +create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +---- Sprint 5 ---- +create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +---- Sprint 6 ---- +create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); + +create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); + +create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); + +create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); + +---- Sprint 7 ---- +create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness; +create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr; +create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year; +create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub; +create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year; +create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year; +create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable; +create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess; +create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year; +create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id); + +create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id); +create view TARGET.indi_is_funder_plan_s as select * from SOURCE.indi_is_funder_plan_s; +create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness; +create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_openess; +create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable; +create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness; +create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess; +create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable; +create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id); +create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id); \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/workflow.xml new file mode 100644 index 000000000..e49552c60 --- /dev/null +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/workflow.xml @@ -0,0 +1,118 @@ + + + + stats_db_name + the target stats database name + + + graph_db_name + the graph database name + + + monitor_irish_db_name + the target monitor db name + + + monitor_irish_db_prod_name + the name of the production monitor db + + + monitor_irish_db_shadow_name + the name of the shadow monitor db + + + hive_metastore_uris + hive server metastore URIs + + + hive_jdbc_url + hive server jdbc url + + + hive_timeout + the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds. + + + hadoop_user_name + user name of the wf owner + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hive_metastore_uris} + + + hive.txn.timeout + ${hive_timeout} + + + mapred.job.queue.name + analytics + + + + + + + + ${wf:conf('resumeFrom') eq 'Step1-buildIrishMonitorDB'} + ${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'} + ${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'} + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${jobTracker} + ${nameNode} + monitor_irish.sh + ${stats_db_name} + ${monitor_irish_db_name} + ${monitor_irish_db_shadow_name} + ${wf:appPath()}/scripts/buildIrishMonitorDB.sql + ${graph_db_name} + monitor_irish.sh + + + + + + + + ${jobTracker} + ${nameNode} + copyDataToImpalaCluster.sh + ${monitor_irish_db_name} + ${hadoop_user_name} + copyDataToImpalaCluster.sh + + + + + + + + ${jobTracker} + ${nameNode} + finalizeImpalaCluster.sh + ${monitor_irish_db_name} + ${monitor_irish_db_prod_name} + ${monitor_irish_db_shadow_name} + finalizeImpalaCluster.sh + + + + + + +