convert_hive_to_spark_actions #1

Merged
antonis.lempesis merged 20 commits from convert_hive_to_spark_actions into beta 2024-09-23 13:53:29 +02:00
5 changed files with 36 additions and 60 deletions
Showing only changes of commit 7ce051d766 - Show all commits

View File

@ -17,4 +17,4 @@ FROM ${stats_db_name}.result_projects,
${stats_db_name}.project
WHERE result_projects.id = result.id
AND result.type = 'publication'
AND project.id = result_projects.project; -- /*EOS*/
AND project.id = result_projects.project; /*EOS*/

View File

@ -66,4 +66,4 @@ DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE;
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/

View File

@ -137,7 +137,7 @@ select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl
from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/

View File

@ -41,31 +41,31 @@ DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, langs.languages AS language
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; -- /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS
SELECT substr(d.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; -- /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; -- /*EOS*/
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/
-- datasource sources:
-- where the datasource info have been collected from.
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; -- /*EOS*/
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/
create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS
select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; -- /*EOS*/
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id, id AS result
FROM ${stats_db_name}.result_datasources; -- /*EOS*/
FROM ${stats_db_name}.result_datasources; /*EOS*/

View File

@ -307,7 +307,7 @@
<error to="Kill"/>
</action>
<!-- <action name="Step8">
<action name="Step8">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -315,29 +315,18 @@
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
&#45;&#45;conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>&#45;&#45;hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>&#45;&#45;sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql</arg>
<arg>&#45;&#45;stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>&#45;&#45;openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
</spark>
<ok to="Step9"/>
<error to="Kill"/>
</action>-->
<action name="Step8">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step8.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step9"/>
<error to="Kill"/>
</action>
<action name="Step9">
@ -381,12 +370,11 @@
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<!-- <ok to="Step11"/>-->
<ok to="End"/>
<ok to="Step11"/>
<error to="Kill"/>
</action>
<!-- <action name="Step11">
<action name="Step11">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -394,31 +382,19 @@
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
&#45;&#45;conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>&#45;&#45;hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>&#45;&#45;sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql</arg>
<arg>&#45;&#45;stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>&#45;&#45;openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>&#45;&#45;external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<ok to="Step12"/>
<error to="Kill"/>
</action>-->
<action name="Step11">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step11.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
</hive2>
<ok to="Step12"/>
<error to="Kill"/>
</action>
<action name="Step12">
@ -533,7 +509,7 @@
</action>
<action name="Contexts">
<shell xmlns="uri:oozie:shell-action:0.1">
<shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>contexts.sh</exec>
@ -624,7 +600,7 @@
</action>
<action name="Step19-finalize">
<shell xmlns="uri:oozie:shell-action:0.1">
<shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizedb.sh</exec>
@ -637,7 +613,7 @@
</action>
<action name="step20-createMonitorDB">
<shell xmlns="uri:oozie:shell-action:0.1">
<shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>monitor.sh</exec>
@ -670,7 +646,7 @@
<!-- </action>-->
<action name="step21-createObservatoryDB-pre">
<shell xmlns="uri:oozie:shell-action:0.1">
<shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>observatory-pre.sh</exec>
@ -706,7 +682,7 @@
</action>
<action name="step21-createObservatoryDB-post">
<shell xmlns="uri:oozie:shell-action:0.1">
<shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>observatory-post.sh</exec>
@ -719,7 +695,7 @@
</action>
<action name="step22-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
@ -738,7 +714,7 @@
</action>
<action name="step22a-createPDFsAggregated">
<shell xmlns="uri:oozie:shell-action:0.1">
<shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>createPDFsAggregated.sh</exec>
@ -754,7 +730,7 @@
</action>
<action name="step23-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
@ -773,7 +749,7 @@
</action>
<action name="Step24-updateCache">
<shell xmlns="uri:oozie:shell-action:0.1">
<shell xmlns="uri:oozie:shell-action:0.3">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>updateCache.sh</exec>