data4impact/apps/data4impact-import-scripts/scripts/project_portfolios/additional_scripts.sql


alter table project_portfolio add column json json;
alter table project_portfolio add column administrative_data json;
alter table project_portfolio add column governance_data json;

--sections
alter table project_portfolio add column executive_summary json;
alter table project_portfolio add column final_report_summary json;
alter table project_portfolio add column impact json;
alter table project_portfolio add column objective json;
alter table project_portfolio add column title json;
--/sections

update project_portfolio set json = convert_from(decode(portfolio, 'base64'), 'UTF8')::json ;
update project_portfolio set administrative_data    = json->'administrative_data';
update project_portfolio set governance_data        = json->'governance_data';
update project_portfolio set executive_summary      = json->'sections'->'executive_summary';
update project_portfolio set final_report_summary   = json->'sections'->'final_report_summary';
update project_portfolio set impact                 = json->'sections'->'impact';
update project_portfolio set objective              = json->'sections'->'objective';

update project_portfolio set results_in_brief       = json->'sections'->'results_in_brief';
update project_portfolio set results                = json->'sections'->'results';
update project_portfolio set impact                 = json->'sections'->'impact';

update project_portfolio set title                  = json->'sections'->'title';

-- document
INSERT INTO DOCUMENT
            (id,
             title,
             abstract,
             doctype,
             repository,
             rights,
             pubyear)
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_object'),
              '40|corda_______', '50|fp7___object')    AS id,
       'Objectives of project '
       ||( administrative_data ->> 'acronym' :: TEXT ) AS title,
       objective ->> 'text'                            AS abstract,
       'project_report'                                AS doctype,
       'CORDIS'                                        AS repository,
       'OPEN' :: TEXT                                  AS rights,
       administrative_data ->> 'date_to' :: TEXT       AS pubyear
FROM   project_portfolio
WHERE objective ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020summary'),
              '40|corda_______', '50|fp7__summary')    AS id,
       'Final report summary of project '
       ||( administrative_data ->> 'acronym' :: TEXT ) AS title,
       final_report_summary ->> 'text'                 AS abstract,
       'project_report'                                AS doctype,
       'CORDIS'                                        AS repository,
       'OPEN' :: TEXT                                  AS rights,
       administrative_data ->> 'date_to' :: TEXT       AS pubyear
FROM   project_portfolio
WHERE final_report_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020___exec'),
              '40|corda_______', '50|fp7_____exec')    AS id,
       'Executive summary of project '
       ||( administrative_data ->> 'acronym' :: TEXT ) AS title,
       executive_summary ->> 'text'                    AS abstract,
       'project_report'                                AS doctype,
       'CORDIS'                                        AS repository,
       'OPEN' :: TEXT                                  AS rights,
       administrative_data ->> 'date_to' :: TEXT       AS pubyear
FROM   project_portfolio
WHERE executive_summary ->> 'text' IS NOT NULL

UNION ALL

SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020__brief'),
              '40|corda_______', '50|fp7____brief')    AS id,
       'Results in brief of project '
       ||( administrative_data ->> 'acronym' :: TEXT ) AS title,
       results_in_brief ->> 'text'                     AS abstract,
       'project_report'                                AS doctype,
       'CORDIS'                                        AS repository,
       'OPEN' :: TEXT                                  AS rights,
       administrative_data ->> 'date_to' :: TEXT       AS pubyear
FROM   project_portfolio
WHERE results_in_brief ->> 'text' IS NOT NULL

UNION ALL

SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020results'),
              '40|corda_______', '50|fp7__results')    AS id,
       'Results of project '
       ||( administrative_data ->> 'acronym' :: TEXT ) AS title,
       results ->> 'text'                              AS abstract,
       'project_report'                                AS doctype,
       'CORDIS'                                        AS repository,
       'OPEN' :: TEXT                                  AS rights,
       administrative_data ->> 'date_to' :: TEXT       AS pubyear
FROM   project_portfolio
WHERE results ->> 'text' IS NOT NULL

UNION ALL

SELECT Replace(Replace(projectid, '40|corda__h2020', '50|h2020_impact'),
              '40|corda_______', '50|fp7___impact')    AS id,
       'Impact of project '
       ||( administrative_data ->> 'acronym' :: TEXT ) AS title,
       impact ->> 'text'                               AS abstract,
       'project_report'                                AS doctype,
       'CORDIS'                                        AS repository,
       'OPEN' :: TEXT                                  AS rights,
       administrative_data ->> 'date_to' :: TEXT       AS pubyear
FROM   project_portfolio
WHERE impact ->> 'text' IS NOT NULL


-- doc_project
INSERT INTO doc_project
            (docid,
             projectid,
             inferred)
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_object'),
              '40|corda_______', '50|fp7___object') AS docid,
       projectid,
       TRUE                                         AS inferred
FROM   project_portfolio
WHERE objective ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020summary'),
              '40|corda_______', '50|fp7__summary') AS docid,
       projectid,
       TRUE                                         AS inferred
FROM   project_portfolio
WHERE final_report_summary ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020___exec'),
              '40|corda_______', '50|fp7_____exec') AS docid,
       projectid,
       TRUE                                         AS inferred
FROM   project_portfolio
WHERE executive_summary ->> 'text' IS NOT NULL


UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020__brief'),
              '40|corda_______', '50|fp7____brief') AS docid,
       projectid,
       TRUE                                         AS inferred
FROM   project_portfolio
WHERE results_in_brief ->> 'text' IS NOT NULL

UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020results'),
              '40|corda_______', '50|fp7__results') AS docid,
       projectid,
       TRUE                                         AS inferred
FROM   project_portfolio
WHERE results ->> 'text' IS NOT NULL
UNION ALL
SELECT REPLACE(REPLACE(projectid, '40|corda__h2020', '50|h2020_impact'),
              '40|corda_______', '50|fp7___impact') AS docid,
       projectid,
       TRUE                                         AS inferred
FROM   project_portfolio
WHERE impact ->> 'text' IS NOT NULL

-- updates the project table with data from the project_portfolios
update project p set (total_cost,contribution,currency) = ((administrative_data->>'Total cost')::numeric, (administrative_data->>'contribution')::numeric, 'EURO'::text) from
project_portfolio pp where pp.projectid = p.id ;

update project_organization po set (contribution, currency) = (U.contribution, 'EURO'::text) from
(
	select projectid, '20|ec__________::'||MD5(o->>'pic') as orgid, (o->>'contribution')::numeric as contribution from
		(
			select projectid, json_array_elements(administrative_data->'coordinators') as o from project_portfolio
			union all
			select projectid, json_array_elements(administrative_data->'participants') as o from project_portfolio
		) as T
) as U where po.orgid = U.orgid and po.projectid = U.projectid

-- include start/end dates from project portfolios
update project p set startdate = pp.administrative_data->>'date_from' from project_portfolio pp where p.startdate is null and p.id = pp.projectid ;
update project p set enddate = pp.administrative_data->>'date_to' from project_portfolio pp where p.enddate is null and p.id = pp.projectid ;


-- work in progress, waiting for ARC to fix the data. For now we keep only the activity types that do not contain any new-line characters
update project_organization po set activitytype = o.activitytype from (select '20|ec__________::'||MD5(o->>'pic') as orgid, o->>'activity_type' as activitytype from
(
select json_array_elements(administrative_data->'coordinators') as o from project_portfolio
union all
select json_array_elements(administrative_data->'participants') as o from project_portfolio
) as T
where o->>'pic' is not null and o->>'activity_type' !~ E'.*\n.*') o where po.orgid = o.orgid;


-- extract PubMed publications from the project portfolios
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | map([.key, .value.ArticleTitle, .value.AbstractText, .value.ArticleDate]) | .[] | @csv' {} \; > ../document_pp.csv
find . -name '*.json' -exec bash -c "jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as \$id | .value.Authors | to_entries | .[] | .key as \$i | { docid : \$id, fullname : (.value.LastName+\", \"+.value.ForeName), rank: (map(\$i+1) | unique | .[0]) }  ) ' \"{}\" | jq -s -r 'map([.docid, .fullname, .rank]) | .[] | @csv'  " \; > ../doc_author_pp.csv
find . -name '*.json' -exec  jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.OtherIDs | map([$id, .Source, .id ] )) | .[] | @csv ' {} \; > ../doc_other_id.csv
find . -name 'FP7*.json' -exec  jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda_______::\",  \$grant ] | @csv  " {} \; > ../doc_project_pp.csv                                 15:50:53
find . -name 'H2020*.json' -exec  jq -r ".administrative_data.project_id as \$grant | .publications.pubmed_abstracts | to_entries | .[] | [ .key, \"40|corda__h2020::\",  \$grant ] | @csv  " {} \; >> ../doc_project_pp.csv
find . -name '*.json' -exec jq -r '.publications.pubmed_abstracts | to_entries | .[] | (.key as $id | .value.MeshHeadings | map([$id,  (group_by(.Label) | .[] )])) | map([.[0], .[1][0].text, ([(.[2][]?.text )] | join("@"))  ]) | .[] | @csv ' {} \;  > ../doc_subject_pp.csv

//DOCUMENTS
create table document_pp(id text, title text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN', doctype text default 'publication');
copy document_pp (id, title, abstract, pubyear) from '/Users/claudio/workspace/data/d4i/document_pp.csv' CSV ;
create table document_pp_unique as (select distinct * from document_pp );
drop table document_pp;
alter table document_pp_unique rename to document_pp ;
update document_pp set repository = 'PubMed Central PP' ;
update document_pp set pubyear = to_date(pubyear, 'DD/MM/YYYY')::text ;
update document_pp set id= '50|pp_______267::'||MD5(id) ;

// DOC_AUTHOR
create table doc_author_pp (docid text, fullname text, rank integer);
copy doc_author_pp (docid, fullname, rank) from '/Users/claudio/workspace/data/d4i/doc_author_pp.csv' CSV ;
update doc_author_pp set fullname = SUBSTRING(fullname, 0, length(fullname) + 1 - 2) where fullname like '%, ';
create table doc_author_pp_u as (select distinct * from doc_author_pp) ;
drop table doc_author_pp;
alter table doc_author_pp_u rename to doc_author_pp ;
update doc_author_pp set docid = '50|pp_______267::'||MD5(docid) ;

// DOC_SUBJECT
create table doc_subject_pp(docid text, subject text, typology text);
create table subject_tmp(id text, descriptor text, qualifiers text);
copy subject_tmp (id, descriptor, qualifiers) from '/Users/claudio/workspace/data/d4i/doc_subject_pp.csv' CSV;
insert into doc_subject_pp select '50|pp_______267::'||MD5(id) as docid, s as subject, 'MeshHeadings' as typology from ( select id, d||'|'||q as s from ( select id, descriptor as d, unnest(regexp_split_to_array(qualifiers, '@')) as q from subject_tmp where qualifiers <> '') as t UNION ALL select distinct id, descriptor as s from subject_tmp) as t ;
create table doc_subject_pp_u as select distinct * from doc_subject_pp;
drop table doc_subject_pp;
alter table doc_subject_pp_u rename to doc_subject_pp;

// DOC_PROJECT
create table doc_project_pp(docid text, projectid text);
create table dp_tmp (docid text, profix text, grantid text) ;
copy dp_tmp(docid, profix, grantid) from '/Users/claudio/workspace/data/d4i/doc_project_pp.csv' CSV;
insert into doc_project_pp select '50|pp_______267::'||MD5(docid), profix||MD5(grantid) from dp_tmp ;

// DOC_OTHER_IDENTIFIER
create table doc_other_identifier_pp(docid text, idtype text, id text);
copy doc_other_identifier_pp (docid, idtype, id) from '/Users/claudio/workspace/data/d4i/doc_other_id.csv' CSV;
update doc_other_identifier_pp set idtype = 'pmid' where idtype = 'pubmed' ;
update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ;
update doc_other_identifier_pp set docid=  '50|pp_______267::'||MD5(docid);


// Caricamento dei csv sul db, cleaning degli idtype, generazione dei subject (mesh), distinct values, ...


create table doc_alias_pp(id text, idpp text);
insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> '';

alter table document_pp             add column existing_docid text;
alter table doc_other_identifier_pp add column existing_docid text;
alter table doc_author_pp           add column existing_docid text;
alter table doc_project_pp          add column existing_docid text;
alter table doc_subject_pp          add column existing_docid text;

update document_pp             set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp;
update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp;
update doc_author_pp           set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid  = doc_alias_pp.idpp;
update doc_project_pp          set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp;
update doc_subject_pp          set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp;

update document_pp             set id    = existing_docid where existing_docid is not null;
update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null;
update doc_author_pp           set docid = existing_docid where existing_docid is not null;
update doc_project_pp          set docid = existing_docid where existing_docid is not null;
update doc_subject_pp          set docid = existing_docid where existing_docid is not null;

alter table document_pp             drop column existing_docid ;
alter table doc_other_identifier_pp drop column existing_docid ;
alter table doc_author_pp           drop column existing_docid ;
alter table doc_project_pp          drop column existing_docid ;
alter table doc_subject_pp          drop column existing_docid ;

-- ONLY FOR MISSING DOCUMENT
insert into document (id, title, abstract, doctype, repository, pubyear, rights) select id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp_______267::%';
insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp_______267::%' on conflict do nothing;

-- FOR ALL DOCUMENTS (I exclude the pii ids because it seems that the same id is associated to many documents)
insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid;
insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing;
insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing;


----------------------------

-- extract RestPublications publications from the project portfolios

//--PART 1 - to be run in local
	find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | map([.key, .value.title, .value.resulttype, .value.description, .value.dateofacceptance]) | .[] | @csv' {} \; > ../document_pp.csv
	find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.creators | map([$id, .full, .rank])) | .[] | @csv' {} \; > ../doc_author_pp.csv
	find . -name 'FP7*.json' -exec  jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda_______::\",  \$grant ] | @csv  " {} \; > ../doc_project_pp.csv
	find . -name 'H2020*.json' -exec  jq -r ".administrative_data.project_id as \$grant | .publications.rest_publications | to_entries | .[] | [ .key, \"40|corda__h2020::\",  \$grant ] | @csv  " {} \; >> ../doc_project_pp.csv
	find . -name '*.json' -exec jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.subjects | map([$id, .value, .class])) | .[] | @csv' {} \;  > ../doc_subject_pp.csv
	find . -name '*.json' -exec  jq -r '.publications.rest_publications | to_entries | .[] | (.key as $id | .value.pids | map([$id, .value, .class])) | .[] | @csv' {} \; > ../doc_other_id.csv


//DOCUMENTS
	drop table if exists document_pp;
	create table document_pp(id text, title text, doctype text, abstract text, pubyear text, repository text, rights text default 'UNKNOWN');
	copy document_pp (id, title, doctype, abstract, pubyear) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/document_pp.csv' CSV ;
	create table document_pp_unique as (select distinct * from document_pp );
	drop table document_pp;
	alter table document_pp_unique rename to document_pp ;
	update document_pp set repository = 'Rest Publications PP' ;
	update document_pp set id= '50|pp__restpubs::'||MD5(id);

// DOC_AUTHOR
	drop table if exists doc_author_pp;
	create table doc_author_pp (docid text, fullname text, rank integer);
	copy doc_author_pp (docid, fullname, rank) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_author_pp.csv' CSV ;
	create table doc_author_pp_u as (select distinct * from doc_author_pp) ;
	drop table doc_author_pp;
	alter table doc_author_pp_u rename to doc_author_pp ;
	update doc_author_pp set docid = '50|pp__restpubs::'||MD5(docid) ;

// DOC_PROJECT
	drop table if exists doc_project_pp;
	create table doc_project_pp(docid text, projectid text);
	create table dp_tmp (docid text, prefix text, grantid text) ;
	copy dp_tmp(docid, prefix, grantid) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_project_pp.csv' CSV;
	insert into doc_project_pp select distinct '50|pp__restpubs::'||MD5(docid), prefix||MD5(grantid) from dp_tmp ;

// DOC_SUBJECT
	drop table if exists doc_subject_pp;
	create table doc_subject_pp(docid text, subject text, typology text);
	copy doc_subject_pp (docid, subject, typology) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_subject_pp.csv' CSV;
	delete from doc_subject_pp where subject is null OR subject = '';
	create table doc_subject_pp_u as select distinct * from doc_subject_pp;
	drop table doc_subject_pp;
	alter table doc_subject_pp_u rename to doc_subject_pp;
	update doc_subject_pp set docid = '50|pp__restpubs::'||MD5(docid) ;

// DOC_OTHER_IDENTIFIER
	drop table if exists doc_other_identifier_pp;
	create table doc_other_identifier_pp(docid text, id text, idtype text);
	copy doc_other_identifier_pp (docid, id, idtype) from '/Users/michele/Develop/data4impact/data4impact-import-scripts/orig/project_portfolios/doc_other_id.csv' CSV;
	delete from doc_other_identifier_pp where id is null OR id = '';
	create table doc_other_identifier_pp_u as select distinct * from doc_other_identifier_pp;
	drop table doc_other_identifier_pp;
	alter table doc_other_identifier_pp_u rename to doc_other_identifier_pp;
	update doc_other_identifier_pp set idtype = 'pmid'  where idtype = 'pubmed' ;
	update doc_other_identifier_pp set idtype = 'pmcid' where idtype = 'pmc' ;
	update doc_other_identifier_pp set docid =  '50|pp__restpubs::'||MD5(docid);

// -- PART 2 - to be run on the server
	create table doc_alias_pp(id text, idpp text);
	insert into doc_alias_pp select distinct doi.docid as id, pp.docid as idpp from doc_other_identifier_pp pp join doc_other_identifier doi on (doi.id = pp.id and doi.idtype = pp.idtype) where doi.docid is not null and doi.docid <> '';

	alter table document_pp             add column existing_docid text;
	alter table doc_other_identifier_pp add column existing_docid text;
	alter table doc_author_pp           add column existing_docid text;
	alter table doc_project_pp          add column existing_docid text;
	alter table doc_subject_pp          add column existing_docid text;

	update document_pp             set existing_docid = doc_alias_pp.id from doc_alias_pp where document_pp.id = doc_alias_pp.idpp;
	update doc_other_identifier_pp set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_other_identifier_pp.docid = doc_alias_pp.idpp;
	update doc_author_pp           set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_author_pp.docid  = doc_alias_pp.idpp;
	update doc_project_pp          set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_project_pp.docid = doc_alias_pp.idpp;
	update doc_subject_pp          set existing_docid = doc_alias_pp.id from doc_alias_pp where doc_subject_pp.docid = doc_alias_pp.idpp;

	update document_pp             set id    = existing_docid where existing_docid is not null;
	update doc_other_identifier_pp set docid = existing_docid where existing_docid is not null;
	update doc_author_pp           set docid = existing_docid where existing_docid is not null;
	update doc_project_pp          set docid = existing_docid where existing_docid is not null;
	update doc_subject_pp          set docid = existing_docid where existing_docid is not null;

	alter table document_pp             drop column existing_docid ;
	alter table doc_other_identifier_pp drop column existing_docid ;
	alter table doc_author_pp           drop column existing_docid ;
	alter table doc_project_pp          drop column existing_docid ;
	alter table doc_subject_pp          drop column existing_docid ;

-- ONLY FOR MISSING DOCUMENT
insert into document (id, title, abstract, doctype, repository, pubyear, rights) select distinct id, title, abstract, doctype, repository, pubyear, rights from document_pp where id like '50|pp__restpubs::%';
insert into doc_author(docid, fullname, rank) select docid, fullname, rank from doc_author_pp where docid like '50|pp__restpubs::%' on conflict do nothing;
-- FOR ALL DOCUMENTS (I exclude the pii ids because it seems that the same id is associated to many documents)
insert into doc_other_identifier(docid, id, idtype) select distinct docid, id, idtype from doc_other_identifier_pp where idtype != 'pii' on conflict (id,idtype) do update set docid = EXCLUDED.docid;
insert into doc_project(docid, projectid) select docid, projectid from doc_project_pp on conflict do nothing;
insert into doc_subject(docid, subject, typology) select docid, subject, typology from doc_subject_pp on conflict do nothing;