alter table project_portfolio add column json json; alter table project_portfolio add column administrative_data json; alter table project_portfolio add column governance_data json; --sections alter table project_portfolio add column executive_summary json; alter table project_portfolio add column final_report_summary json; alter table project_portfolio add column impact json; alter table project_portfolio add column objective json; alter table project_portfolio add column title json; --/sections update project_portfolio set json = convert_from(decode(portfolio, 'base64'), 'UTF8')::json ; update project_portfolio set administrative_data = json->'administrative_data'; update project_portfolio set governance_data = json->'governance_data'; update project_portfolio set executive_summary = json->'sections'->'executive_summary'; update project_portfolio set final_report_summary = json->'sections'->'final_report_summary'; update project_portfolio set impact = json->'sections'->'impact'; update project_portfolio set objective = json->'sections'->'objective'; update project_portfolio set results_in_brief = json->'sections'->'results_in_brief'; update project_portfolio set results = json->'sections'->'results'; update project_portfolio set impact = json->'sections'->'impact'; update project_portfolio set title = json->'sections'->'title'; -- document INSERT INTO DOCUMENT (id, title, abstract, doctype, repository, rights, pubyear) SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_object'), '40|corda_______', '50|fp7___object') AS id, 'Objectives of project ' ||( administrative_data ->> 'acronym' :: TEXT ) AS title, objective ->> 'text' AS abstract, 'project_report' AS doctype, 'CORDIS' AS repository, 'OPEN' :: TEXT AS rights, administrative_data ->> 'date_to' :: TEXT AS pubyear FROM project_portfolio WHERE objective ->> 'text' IS NOT NULL UNION ALL SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020summary'), '40|corda_______', '50|fp7__summary') AS id, 'Final report summary of project ' ||( administrative_data ->> 'acronym' :: TEXT ) AS title, final_report_summary ->> 'text' AS abstract, 'project_report' AS doctype, 'CORDIS' AS repository, 'OPEN' :: TEXT AS rights, administrative_data ->> 'date_to' :: TEXT AS pubyear FROM project_portfolio WHERE final_report_summary ->> 'text' IS NOT NULL UNION ALL SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020___exec'), '40|corda_______', '50|fp7_____exec') AS id, 'Executive summary of project ' ||( administrative_data ->> 'acronym' :: TEXT ) AS title, executive_summary ->> 'text' AS abstract, 'project_report' AS doctype, 'CORDIS' AS repository, 'OPEN' :: TEXT AS rights, administrative_data ->> 'date_to' :: TEXT AS pubyear FROM project_portfolio WHERE executive_summary ->> 'text' IS NOT NULL UNION ALL SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020__brief'), '40|corda_______', '50|fp7____brief') AS id, 'Results in brief of project ' ||( administrative_data ->> 'acronym' :: TEXT ) AS title, results_in_brief ->> 'text' AS abstract, 'project_report' AS doctype, 'CORDIS' AS repository, 'OPEN' :: TEXT AS rights, administrative_data ->> 'date_to' :: TEXT AS pubyear FROM project_portfolio WHERE results_in_brief ->> 'text' IS NOT NULL UNION ALL SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020results'), '40|corda_______', '50|fp7__results') AS id, 'Results of project ' ||( administrative_data ->> 'acronym' :: TEXT ) AS title, results ->> 'text' AS abstract, 'project_report' AS doctype, 'CORDIS' AS repository, 'OPEN' :: TEXT AS rights, administrative_data ->> 'date_to' :: TEXT AS pubyear FROM project_portfolio WHERE results ->> 'text' IS NOT NULL UNION ALL SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_impact'), '40|corda_______', '50|fp7___impact') AS id, 'Impact of project ' ||( administrative_data ->> 'acronym' :: TEXT ) AS title, impact ->> 'text' AS abstract, 'project_report' AS doctype, 'CORDIS' AS repository, 'OPEN' :: TEXT AS rights, administrative_data ->> 'date_to' :: TEXT AS pubyear FROM project_portfolio WHERE impact ->> 'text' IS NOT NULL -- doc_project INSERT INTO doc_project (docid, projectid, inferred) SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_object'), '40|corda_______', '50|fp7___object') AS docid, projectid, TRUE AS inferred FROM project_portfolio WHERE objective ->> 'text' IS NOT NULL UNION ALL SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020summary'), '40|corda_______', '50|fp7__summary') AS docid, projectid, TRUE AS inferred FROM project_portfolio WHERE final_report_summary ->> 'text' IS NOT NULL UNION ALL SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020___exec'), '40|corda_______', '50|fp7_____exec') AS docid, projectid, TRUE AS inferred FROM project_portfolio WHERE executive_summary ->> 'text' IS NOT NULL UNION ALL SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020__brief'), '40|corda_______', '50|fp7____brief') AS docid, projectid, TRUE AS inferred FROM project_portfolio WHERE results_in_brief ->> 'text' IS NOT NULL UNION ALL SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020results'), '40|corda_______', '50|fp7__results') AS docid, projectid, TRUE AS inferred FROM project_portfolio WHERE results ->> 'text' IS NOT NULL UNION ALL SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_impact'), '40|corda_______', '50|fp7___impact') AS docid, projectid, TRUE AS inferred FROM project_portfolio WHERE impact ->> 'text' IS NOT NULL -- updates the project table with data from the project_portfolios update project p set (total_cost,contribution,currency) = ((administrative_data->>'Total cost')::numeric, (administrative_data->>'contribution')::numeric, 'EURO'::text) from project_portfolio pp where pp.projectid = p.id ; update project_organization po set (contribution, currency) = (U.contribution, 'EURO'::text) from ( select projectid, '20|ec__________::'||MD5(o->>'pic') as orgid, (o->>'contribution')::numeric as contribution from ( select projectid, json_array_elements(administrative_data->'coordinators') as o from project_portfolio union all select projectid, json_array_elements(administrative_data->'participants') as o from project_portfolio ) as T ) as U where po.orgid = U.orgid and po.projectid = U.projectid -- include start/end dates from project portfolios update project p set startdate = pp.administrative_data->>'date_from' from project_portfolio pp where p.startdate is null and p.id = pp.projectid ; update project p set enddate = pp.administrative_data->>'date_to' from project_portfolio pp where p.enddate is null and p.id = pp.projectid ; -- work in progress, waiting for ARC to fix the data. For now we keep only the activity types that do not contain any new-line characters update project_organization po set activitytype = o.activitytype from (select '20|ec__________::'||MD5(o->>'pic') as orgid, o->>'activity_type' as activitytype from ( select json_array_elements(administrative_data->'coordinators') as o from project_portfolio union all select json_array_elements(administrative_data->'participants') as o from project_portfolio ) as T where o->>'pic' is not null and o->>'activity_type' !~ E'.*\n.*') o where po.orgid = o.orgid; -- extract PubMed publications from the project portfolios find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | map([.key, .value.ArticleTitle, .value.AbstractText, .value.ArticleDate]) | .[] | @csv' {} \; > ../document_pp.csv find . -name '*.json' -exec bash -c "jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as \$id | .value.Authors | to_entries | .[] | .key as \$i | { docid : \$id, fullname : (.value.LastName+\", \"+.value.ForeName), rank: (map(\$i+1) | unique | .[0]) } ) ' \"{}\" | jq -s -r 'map([.docid, .fullname, .rank]) | .[] | @csv' " \; > ../doc_author_pp.csv find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.OtherIDs | map([$id, .Source, .id ] )) | .[] | @csv ' {} \; > ../doc_other_id.csv find . -name 'FP7*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda_______::\", \$grant ] | @csv " {} \; > ../doc_project_pp.csv 15:50:53 find . -name 'H2020*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda__h2020::\", \$grant ] | @csv " {} \; >> ../doc_project_pp.csv find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.MeshHeadings | map([$id, (group_by(.Label) | .[] )])) | map([.[0], .[1][0].text, ([(.[2][]?.text )] | join("@")) ]) | .[] | @csv ' {} \; > ../doc_subject_pp.csv //DOCUMENTS create table document_pp(id text, title text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN', doctype text default 'publication'); copy document_pp (id, title, abstract, pubyear) from '/Users/claudio/workspace/data/d4i/document_pp.csv' CSV ; create table document_pp_unique as (select distinct * from document_pp ); drop table document_pp; alter table document_pp_unique rename to document_pp ; update document_pp set repository = 'PubMed Central PP' ; update document_pp set pubyear = to_date(pubyear, 'DD/MM/YYYY')::text ; update document_pp set id= '50|pp_______267::'||MD5(id) ; // DOC_AUTHOR create table doc_author_pp (docid text, fullname text, rank integer); copy doc_author_pp (docid, fullname, rank) from '/Users/claudio/workspace/data/d4i/doc_author_pp.csv' CSV ; update doc_author_pp set fullname = SUBSTRING(fullname, 0, length(fullname) + 1 - 2) where fullname like '%, '; create table doc_author_pp_u as (select distinct * from doc_author_pp) ; drop table doc_author_pp; alter table doc_author_pp_u rename to doc_author_pp ; update doc_author_pp set docid = '50|pp_______267::'||MD5(docid) ; // DOC_SUBJECT create table doc_subject_pp(docid text, subject text, typology text); create table subject_tmp(id text, descriptor text, qualifiers text); copy subject_tmp (id, descriptor, qualifiers) from '/Users/claudio/workspace/data/d4i/doc_subject_pp.csv' CSV; insert into doc_subject_pp select '50|pp_______267::'||MD5(id) as docid, s as subject, 'MeshHeadings' as typology from ( select id, d||'|'||q as s from ( select id, descriptor as d, unnest(regexp_split_to_array(qualifiers, '@')) as q from subject_tmp where qualifiers <> '') as t UNION ALL select distinct id, descriptor as s from subject_tmp) as t ; create table doc_subject_pp_u as select distinct * from doc_subject_pp; drop table doc_subject_pp; alter table doc_subject_pp_u rename to doc_subject_pp; // DOC_PROJECT create table doc_project_pp(docid text, projectid text); create table dp_tmp (docid text, profix text, grantid text) ; copy dp_tmp(docid, profix, grantid) from '/Users/claudio/workspace/data/d4i/doc_project_pp.csv' CSV; insert into doc_project_pp select '50|pp_______267::'||MD5(docid), profix||MD5(grantid) from dp_tmp ; // DOC_OTHER_IDENTIFIER create table doc_other_identifier_pp(docid text, idtype text, id text); copy doc_other_identifier_pp (docid, idtype, id) from '/Users/claudio/workspace/data/d4i/doc_other_id.csv' CSV; update doc_other_identifier_pp set idtype = 'pmid' where idtype = 'pubmed' ; update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ; update doc_other_identifier_pp set docid= '50|pp_______267::'||MD5(docid); // Caricamento dei csv sul db, cleaning degli idtype, generazione dei subject (mesh), distinct values, ... create table doc_alias_pp(id text, idpp text); insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> ''; alter table document_pp add column existing_docid text; alter table doc_other_identifier_pp add column existing_docid text; alter table doc_author_pp add column existing_docid text; alter table doc_project_pp add column existing_docid text; alter table doc_subject_pp add column existing_docid text; update document_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp; update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp; update doc_author_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid = doc_alias_pp.idpp; update doc_project_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp; update doc_subject_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp; update document_pp set id = existing_docid where existing_docid is not null; update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null; update doc_author_pp set docid = existing_docid where existing_docid is not null; update doc_project_pp set docid = existing_docid where existing_docid is not null; update doc_subject_pp set docid = existing_docid where existing_docid is not null; alter table document_pp drop column existing_docid ; alter table doc_other_identifier_pp drop column existing_docid ; alter table doc_author_pp drop column existing_docid ; alter table doc_project_pp drop column existing_docid ; alter table doc_subject_pp drop column existing_docid ; -- ONLY FOR MISSING DOCUMENT insert into document (id, title, abstract, doctype, repository, pubyear, rights) select id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp_______267::%'; insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp_______267::%' on conflict do nothing; -- FOR ALL DOCUMENTS (I exclude the pii ids because it seems that the same id is associated to many documents) insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid; insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing; insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing; ---------------------------- -- extract RestPublications publications from the project portfolios //--PART 1 - to be run in local find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | map([.key, .value.title, .value.resulttype, .value.description, .value.dateofacceptance]) | .[] | @csv' {} \; > ../document_pp.csv find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.creators | map([$id, .full, .rank])) | .[] | @csv' {} \; > ../doc_author_pp.csv find . -name 'FP7*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda_______::\", \$grant ] | @csv " {} \; > ../doc_project_pp.csv find . -name 'H2020*.json' -exec jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda__h2020::\", \$grant ] | @csv " {} \; >> ../doc_project_pp.csv find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.subjects | map([$id, .value, .class])) | .[] | @csv' {} \; > ../doc_subject_pp.csv find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.pids | map([$id, .value, .class])) | .[] | @csv' {} \; > ../doc_other_id.csv //DOCUMENTS drop table if exists document_pp; create table document_pp(id text, title text, doctype text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN'); copy document_pp (id, title, doctype, abstract, pubyear) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/document_pp.csv' CSV ; create table document_pp_unique as (select distinct * from document_pp ); drop table document_pp; alter table document_pp_unique rename to document_pp ; update document_pp set repository = 'Rest Publications PP' ; update document_pp set id= '50|pp__restpubs::'||MD5(id); // DOC_AUTHOR drop table if exists doc_author_pp; create table doc_author_pp (docid text, fullname text, rank integer); copy doc_author_pp (docid, fullname, rank) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_author_pp.csv' CSV ; create table doc_author_pp_u as (select distinct * from doc_author_pp) ; drop table doc_author_pp; alter table doc_author_pp_u rename to doc_author_pp ; update doc_author_pp set docid = '50|pp__restpubs::'||MD5(docid) ; // DOC_PROJECT drop table if exists doc_project_pp; create table doc_project_pp(docid text, projectid text); create table dp_tmp (docid text, prefix text, grantid text) ; copy dp_tmp(docid, prefix, grantid) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_project_pp.csv' CSV; insert into doc_project_pp select distinct '50|pp__restpubs::'||MD5(docid), prefix||MD5(grantid) from dp_tmp ; // DOC_SUBJECT drop table if exists doc_subject_pp; create table doc_subject_pp(docid text, subject text, typology text); copy doc_subject_pp (docid, subject, typology) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_subject_pp.csv' CSV; delete from doc_subject_pp where subject is null OR subject = ''; create table doc_subject_pp_u as select distinct * from doc_subject_pp; drop table doc_subject_pp; alter table doc_subject_pp_u rename to doc_subject_pp; update doc_subject_pp set docid = '50|pp__restpubs::'||MD5(docid) ; // DOC_OTHER_IDENTIFIER drop table if exists doc_other_identifier_pp; create table doc_other_identifier_pp(docid text, id text, idtype text); copy doc_other_identifier_pp (docid, id, idtype) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_other_id.csv' CSV; delete from doc_other_identifier_pp where id is null OR id = ''; create table doc_other_identifier_pp_u as select distinct * from doc_other_identifier_pp; drop table doc_other_identifier_pp; alter table doc_other_identifier_pp_u rename to doc_other_identifier_pp; update doc_other_identifier_pp set idtype = 'pmid' where idtype = 'pubmed' ; update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ; update doc_other_identifier_pp set docid = '50|pp__restpubs::'||MD5(docid); // -- PART 2 - to be run on the server create table doc_alias_pp(id text, idpp text); insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> ''; alter table document_pp add column existing_docid text; alter table doc_other_identifier_pp add column existing_docid text; alter table doc_author_pp add column existing_docid text; alter table doc_project_pp add column existing_docid text; alter table doc_subject_pp add column existing_docid text; update document_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp; update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp; update doc_author_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid = doc_alias_pp.idpp; update doc_project_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp; update doc_subject_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp; update document_pp set id = existing_docid where existing_docid is not null; update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null; update doc_author_pp set docid = existing_docid where existing_docid is not null; update doc_project_pp set docid = existing_docid where existing_docid is not null; update doc_subject_pp set docid = existing_docid where existing_docid is not null; alter table document_pp drop column existing_docid ; alter table doc_other_identifier_pp drop column existing_docid ; alter table doc_author_pp drop column existing_docid ; alter table doc_project_pp drop column existing_docid ; alter table doc_subject_pp drop column existing_docid ; -- ONLY FOR MISSING DOCUMENT insert into document (id, title, abstract, doctype, repository, pubyear, rights) select distinct id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp__restpubs::%'; insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp__restpubs::%' on conflict do nothing; -- FOR ALL DOCUMENTS (I exclude the pii ids because it seems that the same id is associated to many documents) insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid; insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing; insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing;