Contexts are now downloaded instead of using the stats_ext db

Antonis Lempesis 2021-03-04 00:42:21 +02:00
parent 27796343ca
commit c5fbad8093
3 changed files with 50 additions and 13 deletions
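
The new contexts.sh script below downloads the context hierarchy from the Context API and loads it into three comma-separated Impala tables (context, category, concept), replacing the views that previously read from the stats_ext database. As a rough illustration of the expected file layout (the ids and labels here are made up for the example, not taken from the live API):

  contexts.csv    dh-ch,Digital Humanities and Cultural Heritage
  categories.csv  dh-ch,dh-ch::projects,DH-CH Projects
  concepts.csv    dh-ch::projects,dh-ch::projects::other,other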


@@ -0,0 +1,33 @@
#!/usr/bin/env bash
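# Usage: contexts.sh <context API base URL> <target impala db>
# Example (the db name is illustrative): contexts.sh https://services.openaire.eu/openaire openaire_stats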
CONTEXT_API=$1
TARGET_DB=$2
TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
echo "Downloading context data"
curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
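# add a catch-all "other" category under every context and an "other" concept under every category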
cat contexts.csv | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv
cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv
echo "uploading context data to hdfs"
hdfs dfs -mkdir ${TMP}
hdfs dfs -copyFromLocal contexts.csv ${TMP}
hdfs dfs -copyFromLocal categories.csv ${TMP}
hdfs dfs -copyFromLocal concepts.csv ${TMP}
hdfs dfs -chmod -R 777 ${TMP}
echo "Creating and populating impala tables"
impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';"
impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';"
impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';"
impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;"
impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;"
impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;"
echo "Cleaning up"
hdfs dfs -rm -f -r -skipTrash ${TMP}
echo "Finito!"


@@ -23,19 +23,6 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
SELECT *
FROM ${external_stats_db_name}.rndexpediture;
CREATE OR REPLACE VIEW ${stats_db_name}.context AS
SELECT *
FROM ${external_stats_db_name}.context;
CREATE OR REPLACE VIEW ${stats_db_name}.category AS
SELECT *
FROM ${external_stats_db_name}.category;
CREATE OR REPLACE VIEW ${stats_db_name}.concept AS
SELECT *
FROM ${external_stats_db_name}.concept;
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Creation date of the database


@@ -41,6 +41,10 @@
<name>hive_timeout</name>
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds.</description>
</property>
<property>
<name>context_api_url</name>
<description>the base URL of the context API (https://services.openaire.eu/openaire)</description>
</property>
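<!-- illustrative example: with context_api_url=https://services.openaire.eu/openaire, contexts.sh calls
     curl https://services.openaire.eu/openaire/contexts?all=true
     and expects a JSON array of {"id": ..., "label": ...} objects -->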
</parameters>
<global>
@@ -264,6 +268,19 @@
<error to="Kill"/>
</action>
<action name="Step17">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>contexts.sh</exec>
<argument>${context_api_url}</argument>
<argument>${stats_db_name}</argument>
<file>contexts.sh</file>
</shell>
<ok to="step20-createMonitorDB"/>
<error to="Kill"/>
</action>
<action name="Step19">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>