data4impact/apps/data4impact-import-scripts/scripts/project_portfolios/additional_scripts.sql

382 lines
23 KiB
SQL

alter table project_portfolio add column json json;
alter table project_portfolio add column administrative_data json;
alter table project_portfolio add column governance_data json;
--sections
alter table project_portfolio add column executive_summary json;
alter table project_portfolio add column final_report_summary json;
alter table project_portfolio add column impact json;
alter table project_portfolio add column objective json;
alter table project_portfolio add column title json;
--/sections
update project_portfolio set json = convert_from(decode(portfolio, 'base64'), 'UTF8')::json ;
update project_portfolio set administrative_data = json->'administrative_data';
update project_portfolio set governance_data = json->'governance_data';
update project_portfolio set executive_summary = json->'sections'->'executive_summary';
update project_portfolio set final_report_summary = json->'sections'->'final_report_summary';
update project_portfolio set impact = json->'sections'->'impact';
update project_portfolio set objective = json->'sections'->'objective';
update project_portfolio set results_in_brief = json->'sections'->'results_in_brief';
update project_portfolio set results = json->'sections'->'results';
update project_portfolio set impact = json->'sections'->'impact';
update project_portfolio set title = json->'sections'->'title';
-- document
INSERT INTO DOCUMENT
(id,
title,
abstract,
doctype,
repository,
rights,
pubyear)
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_object'),
'40|corda_______', '50|fp7___object') AS id,
'Objectives of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
objective ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE objective ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020summary'),
'40|corda_______', '50|fp7__summary') AS id,
'Final report summary of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
final_report_summary ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE final_report_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020___exec'),
'40|corda_______', '50|fp7_____exec') AS id,
'Executive summary of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
executive_summary ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE executive_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020__brief'),
'40|corda_______', '50|fp7____brief') AS id,
'Results in brief of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
results_in_brief ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE results_in_brief ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020results'),
'40|corda_______', '50|fp7__results') AS id,
'Results of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
results ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE results ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_impact'),
'40|corda_______', '50|fp7___impact') AS id,
'Impact of project '
||( administrative_data ->> 'acronym' :: TEXT ) AS title,
impact ->> 'text' AS abstract,
'project_report' AS doctype,
'CORDIS' AS repository,
'OPEN' :: TEXT AS rights,
administrative_data ->> 'date_to' :: TEXT AS pubyear
FROM project_portfolio
WHERE impact ->> 'text' IS NOT NULL
-- doc_project
INSERT INTO doc_project
(docid,
projectid,
inferred)
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_object'),
'40|corda_______', '50|fp7___object') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE objective ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020summary'),
'40|corda_______', '50|fp7__summary') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE final_report_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020___exec'),
'40|corda_______', '50|fp7_____exec') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE executive_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020__brief'),
'40|corda_______', '50|fp7____brief') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE results_in_brief ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020results'),
'40|corda_______', '50|fp7__results') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE results ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_impact'),
'40|corda_______', '50|fp7___impact') AS docid,
projectid,
TRUE AS inferred
FROM project_portfolio
WHERE impact ->> 'text' IS NOT NULL
-- updates the project table with data from the project_portfolios
update project p set (total_cost,contribution,currency) = ((administrative_data->>'Total cost')::numeric, (administrative_data->>'contribution')::numeric, 'EURO'::text) from
project_portfolio pp where pp.projectid = p.id ;
update project_organization po set (contribution, currency) = (U.contribution, 'EURO'::text) from
(
select projectid, '20|ec__________::'||MD5(o->>'pic') as orgid, (o->>'contribution')::numeric as contribution from
(
select projectid, json_array_elements(administrative_data->'coordinators') as o from project_portfolio
union all
select projectid, json_array_elements(administrative_data->'participants') as o from project_portfolio
) as T
) as U where po.orgid = U.orgid and po.projectid = U.projectid
-- include start/end dates from project portfolios
update project p set startdate = pp.administrative_data->>'date_from' from project_portfolio pp where p.startdate is null and p.id = pp.projectid ;
update project p set enddate = pp.administrative_data->>'date_to' from project_portfolio pp where p.enddate is null and p.id = pp.projectid ;
-- work in progress, waiting for ARC to fix the data. For now we keep only the activity types that do not contain any new-line characters
update project_organization po set activitytype = o.activitytype from (select '20|ec__________::'||MD5(o->>'pic') as orgid, o->>'activity_type' as activitytype from
(
select json_array_elements(administrative_data->'coordinators') as o from project_portfolio
union all
select json_array_elements(administrative_data->'participants') as o from project_portfolio
) as T
where o->>'pic' is not null and o->>'activity_type' !~ E'.*\n.*') o where po.orgid = o.orgid;
-- extract PubMed publications from the project portfolios
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | map([.key, .value.ArticleTitle, .value.AbstractText, .value.ArticleDate]) | .[] | @csv' {} \; > ../document_pp.csv
find . -name '*.json' -exec bash -c "jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as \$id | .value.Authors | to_entries | .[] | .key as \$i | { docid : \$id, fullname : (.value.LastName+\", \"+.value.ForeName), rank: (map(\$i+1) | unique | .[0]) } ) ' \"{}\" | jq -s -r 'map([.docid, .fullname, .rank]) | .[] | @csv' " \; > ../doc_author_pp.csv
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.OtherIDs | map([$id, .Source, .id ] )) | .[] | @csv ' {} \; > ../doc_other_id.csv
find . -name 'FP7*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda_______::\", \$grant ] | @csv " {} \; > ../doc_project_pp.csv 15:50:53
find . -name 'H2020*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda__h2020::\", \$grant ] | @csv " {} \; >> ../doc_project_pp.csv
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.MeshHeadings | map([$id, (group_by(.Label) | .[] )])) | map([.[0], .[1][0].text, ([(.[2][]?.text )] | join("@")) ]) | .[] | @csv ' {} \; > ../doc_subject_pp.csv
//DOCUMENTS
create table document_pp(id text, title text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN', doctype text default 'publication');
copy document_pp (id, title, abstract, pubyear) from '/Users/claudio/workspace/data/d4i/document_pp.csv' CSV ;
create table document_pp_unique as (select distinct * from document_pp );
drop table document_pp;
alter table document_pp_unique rename to document_pp ;
update document_pp set repository = 'PubMed Central PP' ;
update document_pp set pubyear = to_date(pubyear, 'DD/MM/YYYY')::text ;
update document_pp set id= '50|pp_______267::'||MD5(id) ;
// DOC_AUTHOR
create table doc_author_pp (docid text, fullname text, rank integer);
copy doc_author_pp (docid, fullname, rank) from '/Users/claudio/workspace/data/d4i/doc_author_pp.csv' CSV ;
update doc_author_pp set fullname = SUBSTRING(fullname, 0, length(fullname) + 1 - 2) where fullname like '%, ';
create table doc_author_pp_u as (select distinct * from doc_author_pp) ;
drop table doc_author_pp;
alter table doc_author_pp_u rename to doc_author_pp ;
update doc_author_pp set docid = '50|pp_______267::'||MD5(docid) ;
// DOC_SUBJECT
create table doc_subject_pp(docid text, subject text, typology text);
create table subject_tmp(id text, descriptor text, qualifiers text);
copy subject_tmp (id, descriptor, qualifiers) from '/Users/claudio/workspace/data/d4i/doc_subject_pp.csv' CSV;
insert into doc_subject_pp select '50|pp_______267::'||MD5(id) as docid, s as subject, 'MeshHeadings' as typology from ( select id, d||'|'||q as s from ( select id, descriptor as d, unnest(regexp_split_to_array(qualifiers, '@')) as q from subject_tmp where qualifiers <> '') as t UNION ALL select distinct id, descriptor as s from subject_tmp) as t ;
create table doc_subject_pp_u as select distinct * from doc_subject_pp;
drop table doc_subject_pp;
alter table doc_subject_pp_u rename to doc_subject_pp;
// DOC_PROJECT
create table doc_project_pp(docid text, projectid text);
create table dp_tmp (docid text, profix text, grantid text) ;
copy dp_tmp(docid, profix, grantid) from '/Users/claudio/workspace/data/d4i/doc_project_pp.csv' CSV;
insert into doc_project_pp select '50|pp_______267::'||MD5(docid), profix||MD5(grantid) from dp_tmp ;
// DOC_OTHER_IDENTIFIER
create table doc_other_identifier_pp(docid text, idtype text, id text);
copy doc_other_identifier_pp (docid, idtype, id) from '/Users/claudio/workspace/data/d4i/doc_other_id.csv' CSV;
update doc_other_identifier_pp set idtype = 'pmid' where idtype = 'pubmed' ;
update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ;
update doc_other_identifier_pp set docid= '50|pp_______267::'||MD5(docid);
// Caricamento dei csv sul db, cleaning degli idtype, generazione dei subject (mesh), distinct values, ...
create table doc_alias_pp(id text, idpp text);
insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> '';
alter table document_pp add column existing_docid text;
alter table doc_other_identifier_pp add column existing_docid text;
alter table doc_author_pp add column existing_docid text;
alter table doc_project_pp add column existing_docid text;
alter table doc_subject_pp add column existing_docid text;
update document_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp;
update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp;
update doc_author_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid = doc_alias_pp.idpp;
update doc_project_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp;
update doc_subject_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp;
update document_pp set id = existing_docid where existing_docid is not null;
update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null;
update doc_author_pp set docid = existing_docid where existing_docid is not null;
update doc_project_pp set docid = existing_docid where existing_docid is not null;
update doc_subject_pp set docid = existing_docid where existing_docid is not null;
alter table document_pp drop column existing_docid ;
alter table doc_other_identifier_pp drop column existing_docid ;
alter table doc_author_pp drop column existing_docid ;
alter table doc_project_pp drop column existing_docid ;
alter table doc_subject_pp drop column existing_docid ;
-- ONLY FOR MISSING DOCUMENT
insert into document (id, title, abstract, doctype, repository, pubyear, rights) select id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp_______267::%';
insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp_______267::%' on conflict do nothing;
-- FOR ALL DOCUMENTS (I exclude the pii ids because it seems that the same id is associated to many documents)
insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid;
insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing;
insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing;
----------------------------
-- extract RestPublications publications from the project portfolios
//--PART 1 - to be run in local
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | map([.key, .value.title, .value.resulttype, .value.description, .value.dateofacceptance]) | .[] | @csv' {} \; > ../document_pp.csv
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.creators | map([$id, .full, .rank])) | .[] | @csv' {} \; > ../doc_author_pp.csv
find . -name 'FP7*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda_______::\", \$grant ] | @csv " {} \; > ../doc_project_pp.csv
find . -name 'H2020*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda__h2020::\", \$grant ] | @csv " {} \; >> ../doc_project_pp.csv
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.subjects | map([$id, .value, .class])) | .[] | @csv' {} \; > ../doc_subject_pp.csv
find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.pids | map([$id, .value, .class])) | .[] | @csv' {} \; > ../doc_other_id.csv
//DOCUMENTS
drop table if exists document_pp;
create table document_pp(id text, title text, doctype text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN');
copy document_pp (id, title, doctype, abstract, pubyear) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/document_pp.csv' CSV ;
create table document_pp_unique as (select distinct * from document_pp );
drop table document_pp;
alter table document_pp_unique rename to document_pp ;
update document_pp set repository = 'Rest Publications PP' ;
update document_pp set id= '50|pp__restpubs::'||MD5(id);
// DOC_AUTHOR
drop table if exists doc_author_pp;
create table doc_author_pp (docid text, fullname text, rank integer);
copy doc_author_pp (docid, fullname, rank) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_author_pp.csv' CSV ;
create table doc_author_pp_u as (select distinct * from doc_author_pp) ;
drop table doc_author_pp;
alter table doc_author_pp_u rename to doc_author_pp ;
update doc_author_pp set docid = '50|pp__restpubs::'||MD5(docid) ;
// DOC_PROJECT
drop table if exists doc_project_pp;
create table doc_project_pp(docid text, projectid text);
create table dp_tmp (docid text, prefix text, grantid text) ;
copy dp_tmp(docid, prefix, grantid) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_project_pp.csv' CSV;
insert into doc_project_pp select distinct '50|pp__restpubs::'||MD5(docid), prefix||MD5(grantid) from dp_tmp ;
// DOC_SUBJECT
drop table if exists doc_subject_pp;
create table doc_subject_pp(docid text, subject text, typology text);
copy doc_subject_pp (docid, subject, typology) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_subject_pp.csv' CSV;
delete from doc_subject_pp where subject is null OR subject = '';
create table doc_subject_pp_u as select distinct * from doc_subject_pp;
drop table doc_subject_pp;
alter table doc_subject_pp_u rename to doc_subject_pp;
update doc_subject_pp set docid = '50|pp__restpubs::'||MD5(docid) ;
// DOC_OTHER_IDENTIFIER
drop table if exists doc_other_identifier_pp;
create table doc_other_identifier_pp(docid text, id text, idtype text);
copy doc_other_identifier_pp (docid, id, idtype) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_other_id.csv' CSV;
delete from doc_other_identifier_pp where id is null OR id = '';
create table doc_other_identifier_pp_u as select distinct * from doc_other_identifier_pp;
drop table doc_other_identifier_pp;
alter table doc_other_identifier_pp_u rename to doc_other_identifier_pp;
update doc_other_identifier_pp set idtype = 'pmid' where idtype = 'pubmed' ;
update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ;
update doc_other_identifier_pp set docid = '50|pp__restpubs::'||MD5(docid);
// -- PART 2 - to be run on the server
create table doc_alias_pp(id text, idpp text);
insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> '';
alter table document_pp add column existing_docid text;
alter table doc_other_identifier_pp add column existing_docid text;
alter table doc_author_pp add column existing_docid text;
alter table doc_project_pp add column existing_docid text;
alter table doc_subject_pp add column existing_docid text;
update document_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp;
update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp;
update doc_author_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid = doc_alias_pp.idpp;
update doc_project_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp;
update doc_subject_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp;
update document_pp set id = existing_docid where existing_docid is not null;
update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null;
update doc_author_pp set docid = existing_docid where existing_docid is not null;
update doc_project_pp set docid = existing_docid where existing_docid is not null;
update doc_subject_pp set docid = existing_docid where existing_docid is not null;
alter table document_pp drop column existing_docid ;
alter table doc_other_identifier_pp drop column existing_docid ;
alter table doc_author_pp drop column existing_docid ;
alter table doc_project_pp drop column existing_docid ;
alter table doc_subject_pp drop column existing_docid ;
-- ONLY FOR MISSING DOCUMENT
insert into document (id, title, abstract, doctype, repository, pubyear, rights) select distinct id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp__restpubs::%';
insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp__restpubs::%' on conflict do nothing;
-- FOR ALL DOCUMENTS (I exclude the pii ids because it seems that the same id is associated to many documents)
insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid;
insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing;
insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing;