ror import

This commit is contained in:
Michele Artini 2020-07-16 16:11:24 +02:00
parent 5f18425c66
commit b98ff0f497
7 changed files with 129 additions and 51 deletions

View File

@ -1,11 +1,12 @@
#!/bin/bash
INDEXBASEURL=http://localhost:9200
INDEXNAME=events_beta_michele
EVENTINDEXNAME=events_beta_michele
NOTIFICATIONINDEXNAME=notifications_beta_michele
echo
echo "Fixing events index..."
curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$INDEXNAME/_mapping" -d '
curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$EVENTINDEXNAME/_mapping" -d '
{
"properties" : {
"map": {
@ -31,7 +32,7 @@ curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$INDEXNAME/_mappin
echo
echo
echo "Fixing notifications index..."
curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$INDEXNAME/_mapping" -d '
curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$NOTIFICATIONINDEXNAME/_mapping" -d '
{
"properties" : {
"map": {

View File

@ -0,0 +1,15 @@
The first import of the organizations should be performed using the sql script: first_import_grid_ac.sql
1) Download the last dump from https://www.grid.ac
2) Update the paths in the sql script
3) Launch the script
If you want to add missing ROR identifiers:
1) Download ror.json from https://figshare.com/collections/ROR_Data/4596503
2) Update the paths in prepare_grid_ror_update.pl and update_ror_ids.sql
3) Launch prepare_grid_ror_update.pl
4) Launch update_ror_ids.sql
NB: The grid.ac dump is richer then ror dump, Ror does not consider some fiels (city, lat, lng) and hierarchical relationships among the organizations.
If grid.ac will be DEPRACATED we'll start using the import from ror (a script is available: prepare_import_ror.pl)

View File

@ -0,0 +1,43 @@
#!/usr/bin/perl
use File::Path 'make_path';
use JSON::Parse 'json_file_to_perl';
use Data::Dumper;
use strict;
use utf8;
binmode(STDOUT, ":utf8");
# THE LATEST VERSION OF row.json IS AVAILABLE AT https://figshare.com/collections/ROR_Data/4596503
my $inputFile = '../../../../data/ror.json';
my $outputFile = '../../../../data/ror_grid.tsv';
my $data = json_file_to_perl($inputFile);
open(my $OUT, ">$outputFile") or die("Can't open an output file");
binmode($OUT, ":utf8");
foreach my $record (@$data) {
my $rorId = $record->{'id'};
my $gridId = '';
while (my ($type, $v) = each (%{$record->{'external_ids'}})) {
if ($type eq 'GRID') {
$gridId = $v->{'all'};
}
}
if ($rorId && $gridId) {
print $OUT $rorId;
print $OUT "\t";
print $OUT $gridId;
print $OUT "\n";
}
}
close($OUT);
print "\nDone.\n\n";
1;

View File

@ -1,5 +1,7 @@
#!/usr/bin/perl
# NOTE: this script MUST NOT BE USED, it will be used only if grid.ac will be deprecated
use File::Path 'make_path';
use Digest::MD5 qw(md5_hex);
use JSON::Parse 'json_file_to_perl';
@ -32,20 +34,16 @@ binmode($OUT_ACRONYMS, ":utf8");
binmode($OUT_RELS, ":utf8");
binmode($OUT_URLS, ":utf8");
foreach my $record (@$data) {
if ($record->{'id'}) {
my $id = 'tmp::' . md5_hex($record->{'id'});
my $id = 'tmp::' . md5_hex($record->{'id'});
# print Dumper $record;
# die;
write_orgs($id, $record, $OUT_ORGS);
write_other_ids($id, $record, $OUT_OTHER_IDS);
write_other_names($id, $record, $OUT_OTHER_NAMES);
write_acronyms($id, $record, $OUT_ACRONYMS);
write_rels($id, $record, $OUT_RELS);
write_urls($id, $record, $OUT_URLS);
}
write_orgs($id, $record, $OUT_ORGS);
write_other_ids($id, $record, $OUT_OTHER_IDS);
write_other_names($id, $record, $OUT_OTHER_NAMES);
write_acronyms($id, $record, $OUT_ACRONYMS);
# write_rels($id, $record, $OUT_RELS);
write_urls($id, $record, $OUT_URLS);
}
close($OUT_ORGS);
@ -67,9 +65,9 @@ sub write_orgs {
print $OUT "\t";
print $OUT getFirstArrayElem($record->{'types'}, 'UNKNOWN');
print $OUT "\t";
print $OUT ""; # lat - TODO MISSING
print $OUT 0; # lat - TODO MISSING
print $OUT "\t";
print $OUT ""; # lng - TODO MISSING
print $OUT 0; # lng - TODO MISSING
print $OUT "\t";
print $OUT ""; # city - TODO MISSING
print $OUT "\t";
@ -87,12 +85,14 @@ sub write_other_ids {
_write_other_ids($id, $record->{'id'}, 'ror', $OUT);
while (my ($type, $v) = each (%{$record->{'external_ids'}})) {
if ($type eq 'GRID') {
_write_other_ids($id, $v->{'all'}, 'grid.ac', $OUT);
} else {
foreach my $other (@{$v->{'all'}}) {
my $all = $v->{'all'};
if (ref $all eq 'ARRAY') {
foreach my $other (@$all) {
_write_other_ids($id, $other, $type, $OUT);
}
} else {
_write_other_ids($id, $all, $type, $OUT);
}
}
}
@ -157,12 +157,12 @@ sub write_acronyms {
sub write_rels {
my ($id, $record, $OUT) = @_;
print $OUT $id;
print $OUT "\t";
print $OUT ""; # reltype - TODO
print $OUT "\t";
print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id)
print $OUT "\n";
# print $OUT $id;
# print $OUT "\t";
# print $OUT ""; # reltype - TODO
# print $OUT "\t";
# print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id)
# print $OUT "\n";
}
sub write_urls {

View File

@ -78,20 +78,20 @@ CREATE TEMP TABLE grid_aliases (
alias text
);
COPY grid_institutes (grid_id,name,wikipedia_url,email_address,established) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/institutes.csv' CSV HEADER;
COPY grid_geonames (geonames_city_id,city,nuts_level1_code,nuts_level1_name,nuts_level2_code,nuts_level2_name,nuts_level3_code,nuts_level3_name,geonames_admin1_code,geonames_admin1_name,geonames_admin1_ascii_name,geonames_admin2_code,geonames_admin2_name,geonames_admin2_ascii_name) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/geonames.csv' CSV HEADER;
COPY grid_addresses (grid_id,line_1,line_2,line_3,lat,lng,postcode,is_primary,city,state,state_code,country,country_code,geonames_city_id) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/addresses.csv' CSV HEADER;
COPY grid_external_ids (grid_id,external_id_type,external_id) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/external_ids.csv' CSV HEADER;
COPY grid_labels (grid_id,iso639,label) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/labels.csv' CSV HEADER;
COPY grid_relationships (grid_id,relationship_type,related_grid_id) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/relationships.csv' CSV HEADER;
COPY grid_types (grid_id,type) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/types.csv' CSV HEADER;
COPY grid_links (grid_id,link) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/links.csv' CSV HEADER;
COPY grid_acronyms (grid_id,acronym) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/acronyms.csv' CSV HEADER;
COPY grid_aliases (grid_id,alias) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/aliases.csv' CSV HEADER;
COPY grid_institutes (grid_id,name,wikipedia_url,email_address,established) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/institutes.csv' CSV HEADER;
COPY grid_geonames (geonames_city_id,city,nuts_level1_code,nuts_level1_name,nuts_level2_code,nuts_level2_name,nuts_level3_code,nuts_level3_name,geonames_admin1_code,geonames_admin1_name,geonames_admin1_ascii_name,geonames_admin2_code,geonames_admin2_name,geonames_admin2_ascii_name) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/geonames.csv' CSV HEADER;
COPY grid_addresses (grid_id,line_1,line_2,line_3,lat,lng,postcode,is_primary,city,state,state_code,country,country_code,geonames_city_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/addresses.csv' CSV HEADER;
COPY grid_external_ids (grid_id,external_id_type,external_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/external_ids.csv' CSV HEADER;
COPY grid_labels (grid_id,iso639,label) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/labels.csv' CSV HEADER;
COPY grid_relationships (grid_id,relationship_type,related_grid_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/relationships.csv' CSV HEADER;
COPY grid_types (grid_id,type) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/types.csv' CSV HEADER;
COPY grid_links (grid_id,link) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/links.csv' CSV HEADER;
COPY grid_acronyms (grid_id,acronym) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/acronyms.csv' CSV HEADER;
COPY grid_aliases (grid_id,alias) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/aliases.csv' CSV HEADER;
INSERT INTO organizations(id, name, type, lat, lng, city, country, created_by, modified_by) (SELECT 'tmp::'||md5(o.grid_id), o.name, COALESCE(t.type, 'UNKNOWN'), a.lat, a.lng, a.city, a.country_code, 'import:grid.ac', 'import:grid.ac' FROM grid_institutes o LEFT OUTER JOIN grid_addresses a ON (o.grid_id=a.grid_id) LEFT OUTER JOIN grid_types t ON (o.grid_id=t.grid_id)) ON CONFLICT DO NOTHING;
INSERT INTO other_ids (id, otherid, type) (SELECT 'tmp::'||md5(grid_id), grid_id, 'grid.ac' FROM grid_institutes ) ON CONFLICT DO NOTHING;
INSERT INTO other_ids (id, otherid, type) (SELECT 'tmp::'||md5(grid_id), grid_id, 'GRID' FROM grid_institutes ) ON CONFLICT DO NOTHING;
INSERT INTO other_ids (id, otherid, type) (SELECT 'tmp::'||md5(grid_id), external_id, external_id_type FROM grid_external_ids ) ON CONFLICT DO NOTHING;
INSERT INTO other_names (id, lang, name) (SELECT 'tmp::'||md5(grid_id), 'en', name FROM grid_institutes ) ON CONFLICT DO NOTHING;
INSERT INTO other_names (id, lang, name) (SELECT 'tmp::'||md5(grid_id), iso639, label FROM grid_labels ) ON CONFLICT DO NOTHING;

View File

@ -1,10 +1,10 @@
DROP VIEW organizations_view;
DROP VIEW organizations_info_view;
DROP VIEW organizations_simple_view;
DROP VIEW users_view;
DROP VIEW conflict_groups_view;
DROP VIEW suggestions_info_by_country_view;
DROP VIEW duplicate_groups_view
DROP VIEW IF EXISTS organizations_view;
DROP VIEW IF EXISTS organizations_info_view;
DROP VIEW IF EXISTS organizations_simple_view;
DROP VIEW IF EXISTS users_view;
DROP VIEW IF EXISTS conflict_groups_view;
DROP VIEW IF EXISTS suggestions_info_by_country_view;
DROP VIEW IF EXISTS duplicate_groups_view;
DROP TABLE IF EXISTS other_ids;
DROP TABLE IF EXISTS other_names;
@ -30,14 +30,16 @@ CREATE TABLE org_types (val text PRIMARY KEY);
INSERT INTO org_types VALUES ('Archive'), ('Company'), ('Education'), ('Facility'), ('Government'), ('Healthcare'), ('Nonprofit'), ('Other'), ('UNKNOWN');
CREATE TABLE id_types (val text PRIMARY KEY);
INSERT INTO id_types VALUES ('CNRS'), ('FundRef'), ('HESA'), ('ISNI'), ('OrgRef'), ('UCAS'), ('UKPRN'), ('Wikidata'), ('grid.ac');
INSERT INTO id_types VALUES ('CNRS'), ('FundRef'), ('HESA'), ('ISNI'), ('LinkedIn'), ('OrgRef'), ('UCAS'), ('UKPRN'), ('Wikidata'), ('GRID'), ('ROR');
CREATE TABLE languages (val text PRIMARY KEY);
INSERT INTO languages VALUES ('UNKNOWN'), ('aa'), ('af'), ('am'), ('ar'), ('as'), ('az'), ('ba'), ('be'), ('bg'), ('bn'), ('br'), ('bs'), ('ca'), ('ch'), ('co'), ('cs'), ('cy'), ('da'),
('de'), ('dv'), ('dz'), ('el'), ('en'), ('eo'), ('es'), ('et'), ('eu'), ('fa'), ('fi'), ('fr'), ('fy'), ('ga'), ('gd'), ('gl'), ('gu'), ('he'), ('hi'), ('hr'), ('hu'), ('hy'), ('id'), ('is'),
('it'), ('iu'), ('ja'), ('jv'), ('ka'), ('kk'), ('kl'), ('km'), ('kn'), ('ko'), ('ku'), ('ky'), ('la'), ('lb'), ('lo'), ('lt'), ('lv'), ('mg'), ('mi'), ('mk'), ('ml'), ('mn'), ('mr'), ('ms'),
('mt'), ('my'), ('nb'), ('ne'), ('nl'), ('nn'), ('no'), ('oc'), ('or'), ('pa'), ('pl'), ('ps'), ('pt'), ('rm'), ('ro'), ('ru'), ('rw'), ('sa'), ('sd'), ('si'), ('sk'), ('sl'), ('so'), ('sq'),
('sr'), ('sv'), ('sw'), ('ta'), ('te'), ('tg'), ('th'), ('tk'), ('tl'), ('tr'), ('tt'), ('ug'), ('uk'), ('ur'), ('uz'), ('vi'), ('xh'), ('yo'), ('zh'), ('zu');
INSERT INTO languages VALUES ('UNKNOWN'),
('aa'),('af'),('am'),('ar'),('as'),('az'),('ba'),('be'),('bg'),('bn'),('br'),('bs'),('ca'),('ch'),('co'),('cs'),('cy'),('da'),('de'),('dv'),
('dz'),('el'),('en'),('eo'),('es'),('et'),('eu'),('fa'),('fi'),('fo'),('fr'),('fy'),('ga'),('gd'),('gl'),('gu'),('he'),('hi'),('hr'),('ht'),
('hu'),('hy'),('id'),('is'),('it'),('iu'),('ja'),('jv'),('ka'),('kk'),('kl'),('km'),('kn'),('ko'),('ku'),('ky'),('la'),('lb'),('lo'),('lt'),
('lv'),('mg'),('mi'),('mk'),('ml'),('mn'),('mr'),('ms'),('mt'),('my'),('nb'),('ne'),('nl'),('no'),('oc'),('om'),('or'),('pa'),('pl'),('ps'),
('pt'),('rm'),('ro'),('ru'),('rw'),('sa'),('sd'),('si'),('sk'),('sl'),('sm'),('so'),('sq'),('sr'),('sv'),('sw'),('ta'),('te'),('tg'),('th'),
('tk'),('tl'),('tr'),('tt'),('ug'),('uk'),('ur'),('uz'),('vi'),('xh'),('yo'),('zh'),('zu');
CREATE TABLE countries (val text PRIMARY KEY);
INSERT INTO countries VALUES ('UNKNOWN'), ('AD'), ('AE'), ('AF'), ('AG'), ('AL'), ('AM'), ('AO'), ('AR'), ('AT'), ('AU'), ('AW'), ('AX'), ('AZ'), ('BA'), ('BB'), ('BD'), ('BE'), ('BF'), ('BG'), ('BH'), ('BI'),

View File

@ -0,0 +1,17 @@
CREATE TEMPORARY TABLE temp_ror_grid (
ror_id text,
grid_id text
);
COPY temp_ror_grid (ror_id, grid_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/ror_grid.tsv' DELIMITER E'\t';
INSERT INTO other_ids(id, otherid, type) (SELECT
o.id as id,
ror.ror_id as otherid,
'ROR' as type
FROM
organizations o
JOIN other_ids oth ON (o.id = oth.id AND oth.type = 'GRID')
JOIN temp_ror_grid ror ON (oth.otherid = ror.grid_id)
) ON CONFLICT DO NOTHING;