From c5fbad8093ca27deebf1b5fd5ffd39e1877c533d Mon Sep 17 00:00:00 2001 From: antleb Date: Thu, 4 Mar 2021 00:42:21 +0200 Subject: [PATCH] Contexts are now downloaded instead of using the stats_ext db --- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 33 +++++++++++++++++++ .../graph/stats/oozie_app/scripts/step10.sql | 13 -------- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 17 ++++++++++ 3 files changed, 50 insertions(+), 13 deletions(-) create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh new file mode 100644 index 000000000..f06a43bb4 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +CONTEXT_API=$1 +TARGET_DB=$2 + +TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 contexts.csv +cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv +cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv +cat contexts.csv | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv +cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv + +echo "uploading context data to hdfs" +hdfs dfs -mkdir ${TMP} +hdfs dfs -copyFromLocal contexts.csv ${TMP} +hdfs dfs -copyFromLocal categories.csv ${TMP} +hdfs dfs -copyFromLocal concepts.csv ${TMP} +hdfs dfs -chmod -R 777 ${TMP} + +echo "Creating and populating impala tables" +impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';" +impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';" +impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';" +impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;" +impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;" +impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;" + +echo "Cleaning up" +hdfs dfs -rm -f -r -skipTrash ${TMP} + +echo "Finito!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index 6c96317e6..77fbd3b18 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -23,19 +23,6 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture; -CREATE OR REPLACE VIEW ${stats_db_name}.context AS -SELECT * -FROM ${external_stats_db_name}.context; - -CREATE OR REPLACE VIEW ${stats_db_name}.category AS -SELECT * -FROM ${external_stats_db_name}.category; - -CREATE OR REPLACE VIEW ${stats_db_name}.concept AS -SELECT * -FROM ${external_stats_db_name}.concept; - - ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Creation date of the database diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index 9c16f149d..afb10c419 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -41,6 +41,10 @@ hive_timeout the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds. + + context_api_url + the base url of the context api (https://services.openaire.eu/openaire) + @@ -263,6 +267,19 @@ + + + + ${jobTracker} + ${nameNode} + contexts.sh + ${context_api_url} + ${stats_db_name} + contexts.sh + + + +