diff --git a/apps/dhp-broker-application/src/main/resources/config/mapping_es7.sh b/apps/dhp-broker-application/src/main/resources/config/mapping_es7.sh index d9daba52..eb851b8a 100755 --- a/apps/dhp-broker-application/src/main/resources/config/mapping_es7.sh +++ b/apps/dhp-broker-application/src/main/resources/config/mapping_es7.sh @@ -1,11 +1,12 @@ #!/bin/bash INDEXBASEURL=http://localhost:9200 -INDEXNAME=events_beta_michele +EVENTINDEXNAME=events_beta_michele +NOTIFICATIONINDEXNAME=notifications_beta_michele echo echo "Fixing events index..." -curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$INDEXNAME/_mapping" -d ' +curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$EVENTINDEXNAME/_mapping" -d ' { "properties" : { "map": { @@ -31,7 +32,7 @@ curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$INDEXNAME/_mappin echo echo echo "Fixing notifications index..." -curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$INDEXNAME/_mapping" -d ' +curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$NOTIFICATIONINDEXNAME/_mapping" -d ' { "properties" : { "map": { diff --git a/apps/dnet-orgs-database-application/README.txt b/apps/dnet-orgs-database-application/README.txt new file mode 100644 index 00000000..3ff268d2 --- /dev/null +++ b/apps/dnet-orgs-database-application/README.txt @@ -0,0 +1,15 @@ +The first import of the organizations should be performed using the sql script: first_import_grid_ac.sql + +1) Download the last dump from https://www.grid.ac +2) Update the paths in the sql script +3) Launch the script + +If you want to add missing ROR identifiers: + +1) Download ror.json from https://figshare.com/collections/ROR_Data/4596503 +2) Update the paths in prepare_grid_ror_update.pl and update_ror_ids.sql +3) Launch prepare_grid_ror_update.pl +4) Launch update_ror_ids.sql + +NB: The grid.ac dump is richer then ror dump, Ror does not consider some fiels (city, lat, lng) and hierarchical relationships among the organizations. +If grid.ac will be DEPRACATED we'll start using the import from ror (a script is available: prepare_import_ror.pl) diff --git a/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_grid_ror_update.pl b/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_grid_ror_update.pl new file mode 100755 index 00000000..c0a6f024 --- /dev/null +++ b/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_grid_ror_update.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl + + +use File::Path 'make_path'; +use JSON::Parse 'json_file_to_perl'; +use Data::Dumper; +use strict; +use utf8; + +binmode(STDOUT, ":utf8"); + +# THE LATEST VERSION OF row.json IS AVAILABLE AT https://figshare.com/collections/ROR_Data/4596503 + +my $inputFile = '../../../../data/ror.json'; +my $outputFile = '../../../../data/ror_grid.tsv'; + +my $data = json_file_to_perl($inputFile); + +open(my $OUT, ">$outputFile") or die("Can't open an output file"); +binmode($OUT, ":utf8"); + +foreach my $record (@$data) { + my $rorId = $record->{'id'}; + my $gridId = ''; + while (my ($type, $v) = each (%{$record->{'external_ids'}})) { + if ($type eq 'GRID') { + $gridId = $v->{'all'}; + } + } + + if ($rorId && $gridId) { + print $OUT $rorId; + print $OUT "\t"; + print $OUT $gridId; + print $OUT "\n"; + } +} + +close($OUT); + +print "\nDone.\n\n"; + +1; diff --git a/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl b/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl old mode 100644 new mode 100755 index 9c86f567..ff7165c1 --- a/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl +++ b/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl @@ -1,5 +1,7 @@ #!/usr/bin/perl +# NOTE: this script MUST NOT BE USED, it will be used only if grid.ac will be deprecated + use File::Path 'make_path'; use Digest::MD5 qw(md5_hex); use JSON::Parse 'json_file_to_perl'; @@ -32,20 +34,16 @@ binmode($OUT_ACRONYMS, ":utf8"); binmode($OUT_RELS, ":utf8"); binmode($OUT_URLS, ":utf8"); + foreach my $record (@$data) { - if ($record->{'id'}) { - my $id = 'tmp::' . md5_hex($record->{'id'}); + my $id = 'tmp::' . md5_hex($record->{'id'}); -# print Dumper $record; -# die; - - write_orgs($id, $record, $OUT_ORGS); - write_other_ids($id, $record, $OUT_OTHER_IDS); - write_other_names($id, $record, $OUT_OTHER_NAMES); - write_acronyms($id, $record, $OUT_ACRONYMS); - write_rels($id, $record, $OUT_RELS); - write_urls($id, $record, $OUT_URLS); - } + write_orgs($id, $record, $OUT_ORGS); + write_other_ids($id, $record, $OUT_OTHER_IDS); + write_other_names($id, $record, $OUT_OTHER_NAMES); + write_acronyms($id, $record, $OUT_ACRONYMS); +# write_rels($id, $record, $OUT_RELS); + write_urls($id, $record, $OUT_URLS); } close($OUT_ORGS); @@ -67,9 +65,9 @@ sub write_orgs { print $OUT "\t"; print $OUT getFirstArrayElem($record->{'types'}, 'UNKNOWN'); print $OUT "\t"; - print $OUT ""; # lat - TODO MISSING + print $OUT 0; # lat - TODO MISSING print $OUT "\t"; - print $OUT ""; # lng - TODO MISSING + print $OUT 0; # lng - TODO MISSING print $OUT "\t"; print $OUT ""; # city - TODO MISSING print $OUT "\t"; @@ -87,12 +85,14 @@ sub write_other_ids { _write_other_ids($id, $record->{'id'}, 'ror', $OUT); while (my ($type, $v) = each (%{$record->{'external_ids'}})) { - if ($type eq 'GRID') { - _write_other_ids($id, $v->{'all'}, 'grid.ac', $OUT); - } else { - foreach my $other (@{$v->{'all'}}) { + my $all = $v->{'all'}; + + if (ref $all eq 'ARRAY') { + foreach my $other (@$all) { _write_other_ids($id, $other, $type, $OUT); } + } else { + _write_other_ids($id, $all, $type, $OUT); } } } @@ -157,12 +157,12 @@ sub write_acronyms { sub write_rels { my ($id, $record, $OUT) = @_; - print $OUT $id; - print $OUT "\t"; - print $OUT ""; # reltype - TODO - print $OUT "\t"; - print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id) - print $OUT "\n"; +# print $OUT $id; +# print $OUT "\t"; +# print $OUT ""; # reltype - TODO +# print $OUT "\t"; +# print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id) +# print $OUT "\n"; } sub write_urls { diff --git a/apps/dnet-orgs-database-application/src/main/resources/sql/import_grid_ac.sql b/apps/dnet-orgs-database-application/src/main/resources/sql/first_import_grid_ac.sql similarity index 72% rename from apps/dnet-orgs-database-application/src/main/resources/sql/import_grid_ac.sql rename to apps/dnet-orgs-database-application/src/main/resources/sql/first_import_grid_ac.sql index d8b160e2..1b2cfdcd 100644 --- a/apps/dnet-orgs-database-application/src/main/resources/sql/import_grid_ac.sql +++ b/apps/dnet-orgs-database-application/src/main/resources/sql/first_import_grid_ac.sql @@ -78,20 +78,20 @@ CREATE TEMP TABLE grid_aliases ( alias text ); -COPY grid_institutes (grid_id,name,wikipedia_url,email_address,established) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/institutes.csv' CSV HEADER; -COPY grid_geonames (geonames_city_id,city,nuts_level1_code,nuts_level1_name,nuts_level2_code,nuts_level2_name,nuts_level3_code,nuts_level3_name,geonames_admin1_code,geonames_admin1_name,geonames_admin1_ascii_name,geonames_admin2_code,geonames_admin2_name,geonames_admin2_ascii_name) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/geonames.csv' CSV HEADER; -COPY grid_addresses (grid_id,line_1,line_2,line_3,lat,lng,postcode,is_primary,city,state,state_code,country,country_code,geonames_city_id) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/addresses.csv' CSV HEADER; -COPY grid_external_ids (grid_id,external_id_type,external_id) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/external_ids.csv' CSV HEADER; -COPY grid_labels (grid_id,iso639,label) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/labels.csv' CSV HEADER; -COPY grid_relationships (grid_id,relationship_type,related_grid_id) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/relationships.csv' CSV HEADER; -COPY grid_types (grid_id,type) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/types.csv' CSV HEADER; -COPY grid_links (grid_id,link) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/links.csv' CSV HEADER; -COPY grid_acronyms (grid_id,acronym) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/acronyms.csv' CSV HEADER; -COPY grid_aliases (grid_id,alias) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/aliases.csv' CSV HEADER; +COPY grid_institutes (grid_id,name,wikipedia_url,email_address,established) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/institutes.csv' CSV HEADER; +COPY grid_geonames (geonames_city_id,city,nuts_level1_code,nuts_level1_name,nuts_level2_code,nuts_level2_name,nuts_level3_code,nuts_level3_name,geonames_admin1_code,geonames_admin1_name,geonames_admin1_ascii_name,geonames_admin2_code,geonames_admin2_name,geonames_admin2_ascii_name) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/geonames.csv' CSV HEADER; +COPY grid_addresses (grid_id,line_1,line_2,line_3,lat,lng,postcode,is_primary,city,state,state_code,country,country_code,geonames_city_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/addresses.csv' CSV HEADER; +COPY grid_external_ids (grid_id,external_id_type,external_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/external_ids.csv' CSV HEADER; +COPY grid_labels (grid_id,iso639,label) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/labels.csv' CSV HEADER; +COPY grid_relationships (grid_id,relationship_type,related_grid_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/relationships.csv' CSV HEADER; +COPY grid_types (grid_id,type) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/types.csv' CSV HEADER; +COPY grid_links (grid_id,link) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/links.csv' CSV HEADER; +COPY grid_acronyms (grid_id,acronym) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/acronyms.csv' CSV HEADER; +COPY grid_aliases (grid_id,alias) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/aliases.csv' CSV HEADER; INSERT INTO organizations(id, name, type, lat, lng, city, country, created_by, modified_by) (SELECT 'tmp::'||md5(o.grid_id), o.name, COALESCE(t.type, 'UNKNOWN'), a.lat, a.lng, a.city, a.country_code, 'import:grid.ac', 'import:grid.ac' FROM grid_institutes o LEFT OUTER JOIN grid_addresses a ON (o.grid_id=a.grid_id) LEFT OUTER JOIN grid_types t ON (o.grid_id=t.grid_id)) ON CONFLICT DO NOTHING; -INSERT INTO other_ids (id, otherid, type) (SELECT 'tmp::'||md5(grid_id), grid_id, 'grid.ac' FROM grid_institutes ) ON CONFLICT DO NOTHING; +INSERT INTO other_ids (id, otherid, type) (SELECT 'tmp::'||md5(grid_id), grid_id, 'GRID' FROM grid_institutes ) ON CONFLICT DO NOTHING; INSERT INTO other_ids (id, otherid, type) (SELECT 'tmp::'||md5(grid_id), external_id, external_id_type FROM grid_external_ids ) ON CONFLICT DO NOTHING; INSERT INTO other_names (id, lang, name) (SELECT 'tmp::'||md5(grid_id), 'en', name FROM grid_institutes ) ON CONFLICT DO NOTHING; INSERT INTO other_names (id, lang, name) (SELECT 'tmp::'||md5(grid_id), iso639, label FROM grid_labels ) ON CONFLICT DO NOTHING; diff --git a/apps/dnet-orgs-database-application/src/main/resources/sql/schema.sql b/apps/dnet-orgs-database-application/src/main/resources/sql/schema.sql index c2844536..c7faf500 100644 --- a/apps/dnet-orgs-database-application/src/main/resources/sql/schema.sql +++ b/apps/dnet-orgs-database-application/src/main/resources/sql/schema.sql @@ -1,10 +1,10 @@ -DROP VIEW organizations_view; -DROP VIEW organizations_info_view; -DROP VIEW organizations_simple_view; -DROP VIEW users_view; -DROP VIEW conflict_groups_view; -DROP VIEW suggestions_info_by_country_view; -DROP VIEW duplicate_groups_view +DROP VIEW IF EXISTS organizations_view; +DROP VIEW IF EXISTS organizations_info_view; +DROP VIEW IF EXISTS organizations_simple_view; +DROP VIEW IF EXISTS users_view; +DROP VIEW IF EXISTS conflict_groups_view; +DROP VIEW IF EXISTS suggestions_info_by_country_view; +DROP VIEW IF EXISTS duplicate_groups_view; DROP TABLE IF EXISTS other_ids; DROP TABLE IF EXISTS other_names; @@ -30,14 +30,16 @@ CREATE TABLE org_types (val text PRIMARY KEY); INSERT INTO org_types VALUES ('Archive'), ('Company'), ('Education'), ('Facility'), ('Government'), ('Healthcare'), ('Nonprofit'), ('Other'), ('UNKNOWN'); CREATE TABLE id_types (val text PRIMARY KEY); -INSERT INTO id_types VALUES ('CNRS'), ('FundRef'), ('HESA'), ('ISNI'), ('OrgRef'), ('UCAS'), ('UKPRN'), ('Wikidata'), ('grid.ac'); +INSERT INTO id_types VALUES ('CNRS'), ('FundRef'), ('HESA'), ('ISNI'), ('LinkedIn'), ('OrgRef'), ('UCAS'), ('UKPRN'), ('Wikidata'), ('GRID'), ('ROR'); CREATE TABLE languages (val text PRIMARY KEY); -INSERT INTO languages VALUES ('UNKNOWN'), ('aa'), ('af'), ('am'), ('ar'), ('as'), ('az'), ('ba'), ('be'), ('bg'), ('bn'), ('br'), ('bs'), ('ca'), ('ch'), ('co'), ('cs'), ('cy'), ('da'), - ('de'), ('dv'), ('dz'), ('el'), ('en'), ('eo'), ('es'), ('et'), ('eu'), ('fa'), ('fi'), ('fr'), ('fy'), ('ga'), ('gd'), ('gl'), ('gu'), ('he'), ('hi'), ('hr'), ('hu'), ('hy'), ('id'), ('is'), - ('it'), ('iu'), ('ja'), ('jv'), ('ka'), ('kk'), ('kl'), ('km'), ('kn'), ('ko'), ('ku'), ('ky'), ('la'), ('lb'), ('lo'), ('lt'), ('lv'), ('mg'), ('mi'), ('mk'), ('ml'), ('mn'), ('mr'), ('ms'), - ('mt'), ('my'), ('nb'), ('ne'), ('nl'), ('nn'), ('no'), ('oc'), ('or'), ('pa'), ('pl'), ('ps'), ('pt'), ('rm'), ('ro'), ('ru'), ('rw'), ('sa'), ('sd'), ('si'), ('sk'), ('sl'), ('so'), ('sq'), - ('sr'), ('sv'), ('sw'), ('ta'), ('te'), ('tg'), ('th'), ('tk'), ('tl'), ('tr'), ('tt'), ('ug'), ('uk'), ('ur'), ('uz'), ('vi'), ('xh'), ('yo'), ('zh'), ('zu'); +INSERT INTO languages VALUES ('UNKNOWN'), + ('aa'),('af'),('am'),('ar'),('as'),('az'),('ba'),('be'),('bg'),('bn'),('br'),('bs'),('ca'),('ch'),('co'),('cs'),('cy'),('da'),('de'),('dv'), + ('dz'),('el'),('en'),('eo'),('es'),('et'),('eu'),('fa'),('fi'),('fo'),('fr'),('fy'),('ga'),('gd'),('gl'),('gu'),('he'),('hi'),('hr'),('ht'), + ('hu'),('hy'),('id'),('is'),('it'),('iu'),('ja'),('jv'),('ka'),('kk'),('kl'),('km'),('kn'),('ko'),('ku'),('ky'),('la'),('lb'),('lo'),('lt'), + ('lv'),('mg'),('mi'),('mk'),('ml'),('mn'),('mr'),('ms'),('mt'),('my'),('nb'),('ne'),('nl'),('no'),('oc'),('om'),('or'),('pa'),('pl'),('ps'), + ('pt'),('rm'),('ro'),('ru'),('rw'),('sa'),('sd'),('si'),('sk'),('sl'),('sm'),('so'),('sq'),('sr'),('sv'),('sw'),('ta'),('te'),('tg'),('th'), + ('tk'),('tl'),('tr'),('tt'),('ug'),('uk'),('ur'),('uz'),('vi'),('xh'),('yo'),('zh'),('zu'); CREATE TABLE countries (val text PRIMARY KEY); INSERT INTO countries VALUES ('UNKNOWN'), ('AD'), ('AE'), ('AF'), ('AG'), ('AL'), ('AM'), ('AO'), ('AR'), ('AT'), ('AU'), ('AW'), ('AX'), ('AZ'), ('BA'), ('BB'), ('BD'), ('BE'), ('BF'), ('BG'), ('BH'), ('BI'), diff --git a/apps/dnet-orgs-database-application/src/main/resources/sql/update_ror_ids.sql b/apps/dnet-orgs-database-application/src/main/resources/sql/update_ror_ids.sql new file mode 100644 index 00000000..5afddceb --- /dev/null +++ b/apps/dnet-orgs-database-application/src/main/resources/sql/update_ror_ids.sql @@ -0,0 +1,17 @@ +CREATE TEMPORARY TABLE temp_ror_grid ( + ror_id text, + grid_id text +); + +COPY temp_ror_grid (ror_id, grid_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/ror_grid.tsv' DELIMITER E'\t'; + + +INSERT INTO other_ids(id, otherid, type) (SELECT + o.id as id, + ror.ror_id as otherid, + 'ROR' as type +FROM + organizations o + JOIN other_ids oth ON (o.id = oth.id AND oth.type = 'GRID') + JOIN temp_ror_grid ror ON (oth.otherid = ror.grid_id) +) ON CONFLICT DO NOTHING;