Correctly assigning Hive contexts to concepts

This commit is contained in:
Antonis Lempesis 2021-03-05 14:12:18 +02:00
parent c5fbad8093
commit 6147ee4950
5 changed files with 21 additions and 6 deletions

View File

@@ -9,8 +9,8 @@ echo "Downloading context data"
curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
cat contexts.csv | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
echo "uploading context data to hdfs" echo "uploading context data to hdfs"
hdfs dfs -mkdir ${TMP} hdfs dfs -mkdir ${TMP}
@@ -29,5 +29,8 @@ impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.
echo "Cleaning up" echo "Cleaning up"
hdfs dfs -rm -f -r -skipTrash ${TMP} hdfs dfs -rm -f -r -skipTrash ${TMP}
rm concepts.csv
rm categories.csv
rm contexts.csv
echo "Finito!" echo "Finito!"

View File

@@ -47,7 +47,10 @@ from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.publication_concepts AS CREATE TABLE ${stats_db_name}.publication_concepts AS
SELECT substr(p.id, 4) as id, contexts.context.id as concept SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.publication p from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.context) contexts as context LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;

View File

@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.dataset p
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.dataset_concepts AS CREATE TABLE ${stats_db_name}.dataset_concepts AS
SELECT substr(p.id, 4) as id, contexts.context.id as concept SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.dataset p from ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.context) contexts as context LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;

View File

@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.software p
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.software_concepts AS CREATE TABLE ${stats_db_name}.software_concepts AS
SELECT substr(p.id, 4) AS id, contexts.context.id AS concept SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.software p FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.context) contexts AS context LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;

View File

@@ -52,7 +52,10 @@ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
SELECT substr(p.id, 4) AS id, contexts.context.id AS concept SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false; where p.datainfo.deletedbyinference = false;