forked from D-Net/dnet-hadoop
convert_hive_to_spark_actions #1
|
@ -17,4 +17,4 @@ FROM ${stats_db_name}.result_projects,
|
|||
${stats_db_name}.project
|
||||
WHERE result_projects.id = result.id
|
||||
AND result.type = 'publication'
|
||||
AND project.id = result_projects.project; -- /*EOS*/
|
||||
AND project.id = result_projects.project; /*EOS*/
|
|
@ -66,4 +66,4 @@ DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
|
|||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
|
||||
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
|
||||
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
|
||||
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE;
|
||||
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/
|
||||
|
|
|
@ -135,9 +135,9 @@ with
|
|||
lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
|
||||
select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
|
||||
from lvl1
|
||||
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
|
||||
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
|
||||
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
|
||||
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
|
||||
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
|
||||
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); /*EOS*/
|
||||
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/
|
||||
|
|
|
@ -41,31 +41,31 @@ DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/
|
|||
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
|
||||
SELECT substr(d.id, 4) AS id, langs.languages AS language
|
||||
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages
|
||||
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/
|
||||
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; -- /*EOS*/
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS
|
||||
SELECT substr(d.id, 4) AS id, oids.ids AS oid
|
||||
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids
|
||||
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; -- /*EOS*/
|
||||
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; -- /*EOS*/
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
|
||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||
FROM ${openaire_db_name}.relation r
|
||||
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; -- /*EOS*/
|
||||
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
-- datasource sources:
|
||||
-- where the datasource info has been collected from.
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; -- /*EOS*/
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS
|
||||
select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
|
||||
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
|
||||
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; -- /*EOS*/
|
||||
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
|
||||
SELECT datasource AS id, id AS result
|
||||
FROM ${stats_db_name}.result_datasources; -- /*EOS*/
|
||||
FROM ${stats_db_name}.result_datasources; /*EOS*/
|
||||
|
|
|
@ -307,7 +307,7 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="Step8">
|
||||
<action name="Step8">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
|
@ -315,29 +315,18 @@
|
|||
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
|
||||
<jar>dhp-stats-update-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
</spark-opts>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql</arg>
|
||||
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
|
||||
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql</arg>
|
||||
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
|
||||
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
|
||||
</spark>
|
||||
<ok to="Step9"/>
|
||||
<error to="Kill"/>
|
||||
</action>-->
|
||||
|
||||
<action name="Step8">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step8.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step9"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step9">
|
||||
|
@ -381,12 +370,11 @@
|
|||
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
|
||||
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
|
||||
</spark>
|
||||
<!-- <ok to="Step11"/>-->
|
||||
<ok to="End"/>
|
||||
<ok to="Step11"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="Step11">
|
||||
<action name="Step11">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
|
@ -394,31 +382,19 @@
|
|||
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
|
||||
<jar>dhp-stats-update-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
</spark-opts>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql</arg>
|
||||
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
|
||||
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
|
||||
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step11.sql</arg>
|
||||
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
|
||||
<arg>--openaire_db_name</arg><arg>${openaire_db_name}</arg>
|
||||
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
|
||||
</spark>
|
||||
<ok to="Step12"/>
|
||||
<error to="Kill"/>
|
||||
</action>-->
|
||||
|
||||
<action name="Step11">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step11.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>external_stats_db_name=${external_stats_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step12"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step12">
|
||||
|
@ -533,7 +509,7 @@
|
|||
</action>
|
||||
|
||||
<action name="Contexts">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>contexts.sh</exec>
|
||||
|
@ -624,7 +600,7 @@
|
|||
</action>
|
||||
|
||||
<action name="Step19-finalize">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>finalizedb.sh</exec>
|
||||
|
@ -637,7 +613,7 @@
|
|||
</action>
|
||||
|
||||
<action name="step20-createMonitorDB">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>monitor.sh</exec>
|
||||
|
@ -670,7 +646,7 @@
|
|||
<!-- </action>-->
|
||||
|
||||
<action name="step21-createObservatoryDB-pre">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>observatory-pre.sh</exec>
|
||||
|
@ -706,7 +682,7 @@
|
|||
</action>
|
||||
|
||||
<action name="step21-createObservatoryDB-post">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>observatory-post.sh</exec>
|
||||
|
@ -719,7 +695,7 @@
|
|||
</action>
|
||||
|
||||
<action name="step22-copyDataToImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>copyDataToImpalaCluster.sh</exec>
|
||||
|
@ -738,7 +714,7 @@
|
|||
</action>
|
||||
|
||||
<action name="step22a-createPDFsAggregated">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>createPDFsAggregated.sh</exec>
|
||||
|
@ -754,7 +730,7 @@
|
|||
</action>
|
||||
|
||||
<action name="step23-finalizeImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>finalizeImpalaCluster.sh</exec>
|
||||
|
@ -773,7 +749,7 @@
|
|||
</action>
|
||||
|
||||
<action name="Step24-updateCache">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.3">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>updateCache.sh</exec>
|
||||
|
|
Loading…
Reference in New Issue