- Update the remaining hive-actions to spark-actions.
- Update the version of shell-actions (0.1 -> 0.3).
- Fix missing "/*EOS*/" indicators (illustrated in the sketch below).
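
A minimal sketch of the end-of-statement convention behind the third point, under the assumption (suggested by the hunks below) that RunSQLSparkJob splits each script on the /*EOS*/ marker and submits the resulting chunks to Spark SQL one at a time. A statement that lacks the marker gets fused with the statement that follows into a single chunk, which Spark SQL cannot execute; the commit therefore adds the missing markers and normalizes the commented "-- /*EOS*/" form to the bare one. The table name below is hypothetical:

    -- Each statement closes with a bare /*EOS*/ marker so the runner can split
    -- the script into single statements before submitting them to Spark SQL.
    DROP TABLE IF EXISTS ${stats_db_name}.example_table PURGE; /*EOS*/

    CREATE TABLE ${stats_db_name}.example_table STORED AS PARQUET AS
    SELECT substr(id, 4) AS id
    FROM ${openaire_db_name}.result
    WHERE datainfo.deletedbyinference = false; /*EOS*/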
Lampros Smyrnaios 2024-07-03 19:49:19 +03:00
parent aa4d7d5e20
commit 7ce051d766
5 changed files with 36 additions and 60 deletions


@@ -17,4 +17,4 @@ FROM ${stats_db_name}.result_projects,
 ${stats_db_name}.project
 WHERE result_projects.id = result.id
 AND result.type = 'publication'
-AND project.id = result_projects.project; -- /*EOS*/
+AND project.id = result_projects.project; /*EOS*/


@@ -66,4 +66,4 @@ DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
 CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
 select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
 lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
-WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE;
+WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/


@@ -135,9 +135,9 @@ with
 lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
 select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
 from lvl1
 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
 join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
-join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
+join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); /*EOS*/
 DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/


@@ -41,31 +41,31 @@ DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/
 CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
 SELECT substr(d.id, 4) AS id, langs.languages AS language
 FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages
-where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; -- /*EOS*/
+where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/
 CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS
 SELECT substr(d.id, 4) AS id, oids.ids AS oid
 FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids
-where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; -- /*EOS*/
+where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/
 CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
 SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
 FROM ${openaire_db_name}.relation r
-WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; -- /*EOS*/
+WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/
 -- datasource sources:
 -- where the datasource info have been collected from.
-DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; -- /*EOS*/
+DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/
 create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS
 select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
 from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
-where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; -- /*EOS*/
+where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
 CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
 SELECT datasource AS id, id AS result
-FROM ${stats_db_name}.result_datasources; -- /*EOS*/
+FROM ${stats_db_name}.result_datasources; /*EOS*/


@@ -307,7 +307,7 @@
 <error to="Kill"/>
 </action>
-<!-- <action name="Step8">
+<action name="Step8">
 <spark xmlns="uri:oozie:spark-action:0.2">
 <master>yarn</master>
 <mode>cluster</mode>
@@ -315,29 +315,18 @@
 <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
 <jar>dhp-stats-update-${projectVersion}.jar</jar>
 <spark-opts>
-&#45;&#45;conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
 ${sparkClusterOpts}
 ${sparkResourceOpts}
 ${sparkApplicationOpts}
 </spark-opts>
-<arg>&#45;&#45;hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
-<arg>&#45;&#45;sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql</arg>
-<arg>&#45;&#45;stats_db_name</arg><arg>${stats_db_name}</arg>
-<arg>&#45;&#45;openaire_db_name</arg><arg>${openaire_db_name}</arg>
+<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
+<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql</arg>
+<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
+<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
 </spark>
 <ok to="Step9"/>
 <error to="Kill"/>
-</action>-->
-<action name="Step8">
-<hive2 xmlns="uri:oozie:hive2-action:0.1">
-<jdbc-url>${hive_jdbc_url}</jdbc-url>
-<script>scripts/step8.sql</script>
-<param>stats_db_name=${stats_db_name}</param>
-<param>openaire_db_name=${openaire_db_name}</param>
-</hive2>
-<ok to="Step9"/>
-<error to="Kill"/>
 </action>
 <action name="Step9">
@@ -381,12 +370,11 @@
 <arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
 <arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
 </spark>
-<!-- <ok to="Step11"/>-->
-<ok to="End"/>
+<ok to="Step11"/>
 <error to="Kill"/>
 </action>
-<!-- <action name="Step11">
+<action name="Step11">
 <spark xmlns="uri:oozie:spark-action:0.2">
 <master>yarn</master>
 <mode>cluster</mode>
@@ -394,31 +382,19 @@
 <class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
 <jar>dhp-stats-update-${projectVersion}.jar</jar>
 <spark-opts>
-&#45;&#45;conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
 ${sparkClusterOpts}
 ${sparkResourceOpts}
 ${sparkApplicationOpts}
 </spark-opts>
-<arg>&#45;&#45;hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
-<arg>&#45;&#45;sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql</arg>
-<arg>&#45;&#45;stats_db_name</arg><arg>${stats_db_name}</arg>
-<arg>&#45;&#45;openaire_db_name</arg><arg>${openaire_db_name}</arg>
-<arg>&#45;&#45;external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
+<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
+<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql</arg>
+<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
+<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
+<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
 </spark>
 <ok to="Step12"/>
 <error to="Kill"/>
-</action>-->
-<action name="Step11">
-<hive2 xmlns="uri:oozie:hive2-action:0.1">
-<jdbc-url>${hive_jdbc_url}</jdbc-url>
-<script>scripts/step11.sql</script>
-<param>stats_db_name=${stats_db_name}</param>
-<param>openaire_db_name=${openaire_db_name}</param>
-<param>external_stats_db_name=${external_stats_db_name}</param>
-</hive2>
-<ok to="Step12"/>
-<error to="Kill"/>
 </action>
 <action name="Step12">
@@ -533,7 +509,7 @@
 </action>
 <action name="Contexts">
-<shell xmlns="uri:oozie:shell-action:0.1">
+<shell xmlns="uri:oozie:shell-action:0.3">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
 <exec>contexts.sh</exec>
@@ -624,7 +600,7 @@
 </action>
 <action name="Step19-finalize">
-<shell xmlns="uri:oozie:shell-action:0.1">
+<shell xmlns="uri:oozie:shell-action:0.3">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
 <exec>finalizedb.sh</exec>
@@ -637,7 +613,7 @@
 </action>
 <action name="step20-createMonitorDB">
-<shell xmlns="uri:oozie:shell-action:0.1">
+<shell xmlns="uri:oozie:shell-action:0.3">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
 <exec>monitor.sh</exec>
@@ -670,7 +646,7 @@
 <!-- </action>-->
 <action name="step21-createObservatoryDB-pre">
-<shell xmlns="uri:oozie:shell-action:0.1">
+<shell xmlns="uri:oozie:shell-action:0.3">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
 <exec>observatory-pre.sh</exec>
@@ -706,7 +682,7 @@
 </action>
 <action name="step21-createObservatoryDB-post">
-<shell xmlns="uri:oozie:shell-action:0.1">
+<shell xmlns="uri:oozie:shell-action:0.3">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
 <exec>observatory-post.sh</exec>
@@ -719,7 +695,7 @@
 </action>
 <action name="step22-copyDataToImpalaCluster">
-<shell xmlns="uri:oozie:shell-action:0.1">
+<shell xmlns="uri:oozie:shell-action:0.3">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
 <exec>copyDataToImpalaCluster.sh</exec>
@@ -738,7 +714,7 @@
 </action>
 <action name="step22a-createPDFsAggregated">
-<shell xmlns="uri:oozie:shell-action:0.1">
+<shell xmlns="uri:oozie:shell-action:0.3">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
 <exec>createPDFsAggregated.sh</exec>
@@ -754,7 +730,7 @@
 </action>
 <action name="step23-finalizeImpalaCluster">
-<shell xmlns="uri:oozie:shell-action:0.1">
+<shell xmlns="uri:oozie:shell-action:0.3">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
 <exec>finalizeImpalaCluster.sh</exec>
@@ -773,7 +749,7 @@
 </action>
 <action name="Step24-updateCache">
-<shell xmlns="uri:oozie:shell-action:0.1">
+<shell xmlns="uri:oozie:shell-action:0.3">
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
 <exec>updateCache.sh</exec>