ror import
This commit is contained in:
parent
5f18425c66
commit
b98ff0f497
|
@ -1,11 +1,12 @@
|
|||
#!/bin/bash
|
||||
|
||||
INDEXBASEURL=http://localhost:9200
|
||||
INDEXNAME=events_beta_michele
|
||||
EVENTINDEXNAME=events_beta_michele
|
||||
NOTIFICATIONINDEXNAME=notifications_beta_michele
|
||||
|
||||
echo
|
||||
echo "Fixing events index..."
|
||||
curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$INDEXNAME/_mapping" -d '
|
||||
curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$EVENTINDEXNAME/_mapping" -d '
|
||||
{
|
||||
"properties" : {
|
||||
"map": {
|
||||
|
@ -31,7 +32,7 @@ curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$INDEXNAME/_mappin
|
|||
echo
|
||||
echo
|
||||
echo "Fixing notifications index..."
|
||||
curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$INDEXNAME/_mapping" -d '
|
||||
curl -XPUT -H 'Content-Type: application/json' "$INDEXBASEURL/$NOTIFICATIONINDEXNAME/_mapping" -d '
|
||||
{
|
||||
"properties" : {
|
||||
"map": {
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
The first import of the organizations should be performed using the sql script: first_import_grid_ac.sql
|
||||
|
||||
1) Download the last dump from https://www.grid.ac
|
||||
2) Update the paths in the sql script
|
||||
3) Launch the script
|
||||
|
||||
If you want to add missing ROR identifiers:
|
||||
|
||||
1) Download ror.json from https://figshare.com/collections/ROR_Data/4596503
|
||||
2) Update the paths in prepare_grid_ror_update.pl and update_ror_ids.sql
|
||||
3) Launch prepare_grid_ror_update.pl
|
||||
4) Launch update_ror_ids.sql
|
||||
|
||||
NB: The grid.ac dump is richer then ror dump, Ror does not consider some fiels (city, lat, lng) and hierarchical relationships among the organizations.
|
||||
If grid.ac will be DEPRACATED we'll start using the import from ror (a script is available: prepare_import_ror.pl)
|
|
@ -0,0 +1,43 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
|
||||
use File::Path 'make_path';
|
||||
use JSON::Parse 'json_file_to_perl';
|
||||
use Data::Dumper;
|
||||
use strict;
|
||||
use utf8;
|
||||
|
||||
binmode(STDOUT, ":utf8");
|
||||
|
||||
# THE LATEST VERSION OF row.json IS AVAILABLE AT https://figshare.com/collections/ROR_Data/4596503
|
||||
|
||||
my $inputFile = '../../../../data/ror.json';
|
||||
my $outputFile = '../../../../data/ror_grid.tsv';
|
||||
|
||||
my $data = json_file_to_perl($inputFile);
|
||||
|
||||
open(my $OUT, ">$outputFile") or die("Can't open an output file");
|
||||
binmode($OUT, ":utf8");
|
||||
|
||||
foreach my $record (@$data) {
|
||||
my $rorId = $record->{'id'};
|
||||
my $gridId = '';
|
||||
while (my ($type, $v) = each (%{$record->{'external_ids'}})) {
|
||||
if ($type eq 'GRID') {
|
||||
$gridId = $v->{'all'};
|
||||
}
|
||||
}
|
||||
|
||||
if ($rorId && $gridId) {
|
||||
print $OUT $rorId;
|
||||
print $OUT "\t";
|
||||
print $OUT $gridId;
|
||||
print $OUT "\n";
|
||||
}
|
||||
}
|
||||
|
||||
close($OUT);
|
||||
|
||||
print "\nDone.\n\n";
|
||||
|
||||
1;
|
48
apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl
Normal file → Executable file
48
apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl
Normal file → Executable file
|
@ -1,5 +1,7 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
# NOTE: this script MUST NOT BE USED, it will be used only if grid.ac will be deprecated
|
||||
|
||||
use File::Path 'make_path';
|
||||
use Digest::MD5 qw(md5_hex);
|
||||
use JSON::Parse 'json_file_to_perl';
|
||||
|
@ -32,20 +34,16 @@ binmode($OUT_ACRONYMS, ":utf8");
|
|||
binmode($OUT_RELS, ":utf8");
|
||||
binmode($OUT_URLS, ":utf8");
|
||||
|
||||
|
||||
foreach my $record (@$data) {
|
||||
if ($record->{'id'}) {
|
||||
my $id = 'tmp::' . md5_hex($record->{'id'});
|
||||
my $id = 'tmp::' . md5_hex($record->{'id'});
|
||||
|
||||
# print Dumper $record;
|
||||
# die;
|
||||
|
||||
write_orgs($id, $record, $OUT_ORGS);
|
||||
write_other_ids($id, $record, $OUT_OTHER_IDS);
|
||||
write_other_names($id, $record, $OUT_OTHER_NAMES);
|
||||
write_acronyms($id, $record, $OUT_ACRONYMS);
|
||||
write_rels($id, $record, $OUT_RELS);
|
||||
write_urls($id, $record, $OUT_URLS);
|
||||
}
|
||||
write_orgs($id, $record, $OUT_ORGS);
|
||||
write_other_ids($id, $record, $OUT_OTHER_IDS);
|
||||
write_other_names($id, $record, $OUT_OTHER_NAMES);
|
||||
write_acronyms($id, $record, $OUT_ACRONYMS);
|
||||
# write_rels($id, $record, $OUT_RELS);
|
||||
write_urls($id, $record, $OUT_URLS);
|
||||
}
|
||||
|
||||
close($OUT_ORGS);
|
||||
|
@ -67,9 +65,9 @@ sub write_orgs {
|
|||
print $OUT "\t";
|
||||
print $OUT getFirstArrayElem($record->{'types'}, 'UNKNOWN');
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # lat - TODO MISSING
|
||||
print $OUT 0; # lat - TODO MISSING
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # lng - TODO MISSING
|
||||
print $OUT 0; # lng - TODO MISSING
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # city - TODO MISSING
|
||||
print $OUT "\t";
|
||||
|
@ -87,12 +85,14 @@ sub write_other_ids {
|
|||
_write_other_ids($id, $record->{'id'}, 'ror', $OUT);
|
||||
|
||||
while (my ($type, $v) = each (%{$record->{'external_ids'}})) {
|
||||
if ($type eq 'GRID') {
|
||||
_write_other_ids($id, $v->{'all'}, 'grid.ac', $OUT);
|
||||
} else {
|
||||
foreach my $other (@{$v->{'all'}}) {
|
||||
my $all = $v->{'all'};
|
||||
|
||||
if (ref $all eq 'ARRAY') {
|
||||
foreach my $other (@$all) {
|
||||
_write_other_ids($id, $other, $type, $OUT);
|
||||
}
|
||||
} else {
|
||||
_write_other_ids($id, $all, $type, $OUT);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -157,12 +157,12 @@ sub write_acronyms {
|
|||
sub write_rels {
|
||||
my ($id, $record, $OUT) = @_;
|
||||
|
||||
print $OUT $id;
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # reltype - TODO
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id)
|
||||
print $OUT "\n";
|
||||
# print $OUT $id;
|
||||
# print $OUT "\t";
|
||||
# print $OUT ""; # reltype - TODO
|
||||
# print $OUT "\t";
|
||||
# print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id)
|
||||
# print $OUT "\n";
|
||||
}
|
||||
|
||||
sub write_urls {
|
||||
|
|
|
@ -78,20 +78,20 @@ CREATE TEMP TABLE grid_aliases (
|
|||
alias text
|
||||
);
|
||||
|
||||
COPY grid_institutes (grid_id,name,wikipedia_url,email_address,established) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/institutes.csv' CSV HEADER;
|
||||
COPY grid_geonames (geonames_city_id,city,nuts_level1_code,nuts_level1_name,nuts_level2_code,nuts_level2_name,nuts_level3_code,nuts_level3_name,geonames_admin1_code,geonames_admin1_name,geonames_admin1_ascii_name,geonames_admin2_code,geonames_admin2_name,geonames_admin2_ascii_name) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/geonames.csv' CSV HEADER;
|
||||
COPY grid_addresses (grid_id,line_1,line_2,line_3,lat,lng,postcode,is_primary,city,state,state_code,country,country_code,geonames_city_id) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/addresses.csv' CSV HEADER;
|
||||
COPY grid_external_ids (grid_id,external_id_type,external_id) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/external_ids.csv' CSV HEADER;
|
||||
COPY grid_labels (grid_id,iso639,label) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/labels.csv' CSV HEADER;
|
||||
COPY grid_relationships (grid_id,relationship_type,related_grid_id) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/relationships.csv' CSV HEADER;
|
||||
COPY grid_types (grid_id,type) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/types.csv' CSV HEADER;
|
||||
COPY grid_links (grid_id,link) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/links.csv' CSV HEADER;
|
||||
COPY grid_acronyms (grid_id,acronym) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/acronyms.csv' CSV HEADER;
|
||||
COPY grid_aliases (grid_id,alias) FROM '/Users/michele/Develop/dnet45/dnet-orgs-database-application/data/grid-2019-05-06/full_tables/aliases.csv' CSV HEADER;
|
||||
COPY grid_institutes (grid_id,name,wikipedia_url,email_address,established) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/institutes.csv' CSV HEADER;
|
||||
COPY grid_geonames (geonames_city_id,city,nuts_level1_code,nuts_level1_name,nuts_level2_code,nuts_level2_name,nuts_level3_code,nuts_level3_name,geonames_admin1_code,geonames_admin1_name,geonames_admin1_ascii_name,geonames_admin2_code,geonames_admin2_name,geonames_admin2_ascii_name) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/geonames.csv' CSV HEADER;
|
||||
COPY grid_addresses (grid_id,line_1,line_2,line_3,lat,lng,postcode,is_primary,city,state,state_code,country,country_code,geonames_city_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/addresses.csv' CSV HEADER;
|
||||
COPY grid_external_ids (grid_id,external_id_type,external_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/external_ids.csv' CSV HEADER;
|
||||
COPY grid_labels (grid_id,iso639,label) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/labels.csv' CSV HEADER;
|
||||
COPY grid_relationships (grid_id,relationship_type,related_grid_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/relationships.csv' CSV HEADER;
|
||||
COPY grid_types (grid_id,type) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/types.csv' CSV HEADER;
|
||||
COPY grid_links (grid_id,link) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/links.csv' CSV HEADER;
|
||||
COPY grid_acronyms (grid_id,acronym) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/acronyms.csv' CSV HEADER;
|
||||
COPY grid_aliases (grid_id,alias) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/grid-2020-06-29/full_tables/aliases.csv' CSV HEADER;
|
||||
|
||||
|
||||
INSERT INTO organizations(id, name, type, lat, lng, city, country, created_by, modified_by) (SELECT 'tmp::'||md5(o.grid_id), o.name, COALESCE(t.type, 'UNKNOWN'), a.lat, a.lng, a.city, a.country_code, 'import:grid.ac', 'import:grid.ac' FROM grid_institutes o LEFT OUTER JOIN grid_addresses a ON (o.grid_id=a.grid_id) LEFT OUTER JOIN grid_types t ON (o.grid_id=t.grid_id)) ON CONFLICT DO NOTHING;
|
||||
INSERT INTO other_ids (id, otherid, type) (SELECT 'tmp::'||md5(grid_id), grid_id, 'grid.ac' FROM grid_institutes ) ON CONFLICT DO NOTHING;
|
||||
INSERT INTO other_ids (id, otherid, type) (SELECT 'tmp::'||md5(grid_id), grid_id, 'GRID' FROM grid_institutes ) ON CONFLICT DO NOTHING;
|
||||
INSERT INTO other_ids (id, otherid, type) (SELECT 'tmp::'||md5(grid_id), external_id, external_id_type FROM grid_external_ids ) ON CONFLICT DO NOTHING;
|
||||
INSERT INTO other_names (id, lang, name) (SELECT 'tmp::'||md5(grid_id), 'en', name FROM grid_institutes ) ON CONFLICT DO NOTHING;
|
||||
INSERT INTO other_names (id, lang, name) (SELECT 'tmp::'||md5(grid_id), iso639, label FROM grid_labels ) ON CONFLICT DO NOTHING;
|
|
@ -1,10 +1,10 @@
|
|||
DROP VIEW organizations_view;
|
||||
DROP VIEW organizations_info_view;
|
||||
DROP VIEW organizations_simple_view;
|
||||
DROP VIEW users_view;
|
||||
DROP VIEW conflict_groups_view;
|
||||
DROP VIEW suggestions_info_by_country_view;
|
||||
DROP VIEW duplicate_groups_view
|
||||
DROP VIEW IF EXISTS organizations_view;
|
||||
DROP VIEW IF EXISTS organizations_info_view;
|
||||
DROP VIEW IF EXISTS organizations_simple_view;
|
||||
DROP VIEW IF EXISTS users_view;
|
||||
DROP VIEW IF EXISTS conflict_groups_view;
|
||||
DROP VIEW IF EXISTS suggestions_info_by_country_view;
|
||||
DROP VIEW IF EXISTS duplicate_groups_view;
|
||||
|
||||
DROP TABLE IF EXISTS other_ids;
|
||||
DROP TABLE IF EXISTS other_names;
|
||||
|
@ -30,14 +30,16 @@ CREATE TABLE org_types (val text PRIMARY KEY);
|
|||
INSERT INTO org_types VALUES ('Archive'), ('Company'), ('Education'), ('Facility'), ('Government'), ('Healthcare'), ('Nonprofit'), ('Other'), ('UNKNOWN');
|
||||
|
||||
CREATE TABLE id_types (val text PRIMARY KEY);
|
||||
INSERT INTO id_types VALUES ('CNRS'), ('FundRef'), ('HESA'), ('ISNI'), ('OrgRef'), ('UCAS'), ('UKPRN'), ('Wikidata'), ('grid.ac');
|
||||
INSERT INTO id_types VALUES ('CNRS'), ('FundRef'), ('HESA'), ('ISNI'), ('LinkedIn'), ('OrgRef'), ('UCAS'), ('UKPRN'), ('Wikidata'), ('GRID'), ('ROR');
|
||||
|
||||
CREATE TABLE languages (val text PRIMARY KEY);
|
||||
INSERT INTO languages VALUES ('UNKNOWN'), ('aa'), ('af'), ('am'), ('ar'), ('as'), ('az'), ('ba'), ('be'), ('bg'), ('bn'), ('br'), ('bs'), ('ca'), ('ch'), ('co'), ('cs'), ('cy'), ('da'),
|
||||
('de'), ('dv'), ('dz'), ('el'), ('en'), ('eo'), ('es'), ('et'), ('eu'), ('fa'), ('fi'), ('fr'), ('fy'), ('ga'), ('gd'), ('gl'), ('gu'), ('he'), ('hi'), ('hr'), ('hu'), ('hy'), ('id'), ('is'),
|
||||
('it'), ('iu'), ('ja'), ('jv'), ('ka'), ('kk'), ('kl'), ('km'), ('kn'), ('ko'), ('ku'), ('ky'), ('la'), ('lb'), ('lo'), ('lt'), ('lv'), ('mg'), ('mi'), ('mk'), ('ml'), ('mn'), ('mr'), ('ms'),
|
||||
('mt'), ('my'), ('nb'), ('ne'), ('nl'), ('nn'), ('no'), ('oc'), ('or'), ('pa'), ('pl'), ('ps'), ('pt'), ('rm'), ('ro'), ('ru'), ('rw'), ('sa'), ('sd'), ('si'), ('sk'), ('sl'), ('so'), ('sq'),
|
||||
('sr'), ('sv'), ('sw'), ('ta'), ('te'), ('tg'), ('th'), ('tk'), ('tl'), ('tr'), ('tt'), ('ug'), ('uk'), ('ur'), ('uz'), ('vi'), ('xh'), ('yo'), ('zh'), ('zu');
|
||||
INSERT INTO languages VALUES ('UNKNOWN'),
|
||||
('aa'),('af'),('am'),('ar'),('as'),('az'),('ba'),('be'),('bg'),('bn'),('br'),('bs'),('ca'),('ch'),('co'),('cs'),('cy'),('da'),('de'),('dv'),
|
||||
('dz'),('el'),('en'),('eo'),('es'),('et'),('eu'),('fa'),('fi'),('fo'),('fr'),('fy'),('ga'),('gd'),('gl'),('gu'),('he'),('hi'),('hr'),('ht'),
|
||||
('hu'),('hy'),('id'),('is'),('it'),('iu'),('ja'),('jv'),('ka'),('kk'),('kl'),('km'),('kn'),('ko'),('ku'),('ky'),('la'),('lb'),('lo'),('lt'),
|
||||
('lv'),('mg'),('mi'),('mk'),('ml'),('mn'),('mr'),('ms'),('mt'),('my'),('nb'),('ne'),('nl'),('no'),('oc'),('om'),('or'),('pa'),('pl'),('ps'),
|
||||
('pt'),('rm'),('ro'),('ru'),('rw'),('sa'),('sd'),('si'),('sk'),('sl'),('sm'),('so'),('sq'),('sr'),('sv'),('sw'),('ta'),('te'),('tg'),('th'),
|
||||
('tk'),('tl'),('tr'),('tt'),('ug'),('uk'),('ur'),('uz'),('vi'),('xh'),('yo'),('zh'),('zu');
|
||||
|
||||
CREATE TABLE countries (val text PRIMARY KEY);
|
||||
INSERT INTO countries VALUES ('UNKNOWN'), ('AD'), ('AE'), ('AF'), ('AG'), ('AL'), ('AM'), ('AO'), ('AR'), ('AT'), ('AU'), ('AW'), ('AX'), ('AZ'), ('BA'), ('BB'), ('BD'), ('BE'), ('BF'), ('BG'), ('BH'), ('BI'),
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
CREATE TEMPORARY TABLE temp_ror_grid (
|
||||
ror_id text,
|
||||
grid_id text
|
||||
);
|
||||
|
||||
COPY temp_ror_grid (ror_id, grid_id) FROM '/Users/michele/Develop/dnet45/dnet-applications/apps/dnet-orgs-database-application/data/ror_grid.tsv' DELIMITER E'\t';
|
||||
|
||||
|
||||
INSERT INTO other_ids(id, otherid, type) (SELECT
|
||||
o.id as id,
|
||||
ror.ror_id as otherid,
|
||||
'ROR' as type
|
||||
FROM
|
||||
organizations o
|
||||
JOIN other_ids oth ON (o.id = oth.id AND oth.type = 'GRID')
|
||||
JOIN temp_ror_grid ror ON (oth.otherid = ror.grid_id)
|
||||
) ON CONFLICT DO NOTHING;
|
Loading…
Reference in New Issue