script to import organizations from ROR

This commit is contained in:
Michele Artini 2020-07-15 15:52:58 +02:00
parent 25e57d5541
commit 5f18425c66
1 changed files with 191 additions and 0 deletions

View File

@ -0,0 +1,191 @@
#!/usr/bin/perl
use File::Path 'make_path';
use Digest::MD5 qw(md5_hex);
use JSON::Parse 'json_file_to_perl';
use Data::Dumper;
use strict;
use utf8;
binmode(STDOUT, ":utf8");
# THE LATEST VERSION OF row.json IS AVAILABLE AT https://figshare.com/collections/ROR_Data/4596503
my $inputFile = '../../../../data/ror.json';
my $outputDir = '../../../../data/ror_tables';
make_path $outputDir or die "Failed to create path: $outputDir" unless (-d $outputDir);
my $data = json_file_to_perl($inputFile);
open(my $OUT_ORGS , ">$outputDir/organizations.tsv") or die("Can't open an output file");
open(my $OUT_OTHER_IDS , ">$outputDir/other_ids.tsv") or die("Can't open an output file");
open(my $OUT_OTHER_NAMES , ">$outputDir/other_names.tsv") or die("Can't open an output file");
open(my $OUT_ACRONYMS , ">$outputDir/acronyms.tsv") or die("Can't open an output file");
open(my $OUT_RELS , ">$outputDir/relationships.tsv") or die("Can't open an output file");
open(my $OUT_URLS , ">$outputDir/urls.tsv") or die("Can't open an output file");
binmode($OUT_ORGS, ":utf8");
binmode($OUT_OTHER_IDS, ":utf8");
binmode($OUT_OTHER_NAMES, ":utf8");
binmode($OUT_ACRONYMS, ":utf8");
binmode($OUT_RELS, ":utf8");
binmode($OUT_URLS, ":utf8");
foreach my $record (@$data) {
if ($record->{'id'}) {
my $id = 'tmp::' . md5_hex($record->{'id'});
# print Dumper $record;
# die;
write_orgs($id, $record, $OUT_ORGS);
write_other_ids($id, $record, $OUT_OTHER_IDS);
write_other_names($id, $record, $OUT_OTHER_NAMES);
write_acronyms($id, $record, $OUT_ACRONYMS);
write_rels($id, $record, $OUT_RELS);
write_urls($id, $record, $OUT_URLS);
}
}
close($OUT_ORGS);
close($OUT_OTHER_IDS);
close($OUT_OTHER_NAMES);
close($OUT_ACRONYMS);
close($OUT_RELS);
close($OUT_URLS);
print "\nDone.\n\n";
sub write_orgs {
my ($id, $record, $OUT) = @_;
print $OUT $id;
print $OUT "\t";
print $OUT $record->{'name'};
print $OUT "\t";
print $OUT getFirstArrayElem($record->{'types'}, 'UNKNOWN');
print $OUT "\t";
print $OUT ""; # lat - TODO MISSING
print $OUT "\t";
print $OUT ""; # lng - TODO MISSING
print $OUT "\t";
print $OUT ""; # city - TODO MISSING
print $OUT "\t";
print $OUT $record->{'country'}->{'country_code'};
print $OUT "\t";
print $OUT "import:ror"; # created_by
print $OUT "\t";
print $OUT "import:ror"; # modified_by
print $OUT "\n";
}
sub write_other_ids {
my ($id, $record, $OUT) = @_;
_write_other_ids($id, $record->{'id'}, 'ror', $OUT);
while (my ($type, $v) = each (%{$record->{'external_ids'}})) {
if ($type eq 'GRID') {
_write_other_ids($id, $v->{'all'}, 'grid.ac', $OUT);
} else {
foreach my $other (@{$v->{'all'}}) {
_write_other_ids($id, $other, $type, $OUT);
}
}
}
}
sub _write_other_ids {
my ($id, $other, $type, $OUT) = @_;
if ($other) {
print $OUT $id;
print $OUT "\t";
print $OUT $other;
print $OUT "\t";
print $OUT $type;
print $OUT "\n";
}
}
sub write_other_names {
my ($id, $record, $OUT) = @_;
_write_other_names($id, $record->{'name'}, 'en', $OUT);
foreach my $alias (@{$record->{'aliases'}}) {
_write_other_names($id, $alias, 'UNKNOWN', $OUT);
}
foreach my $l (@{$record->{'labels'}}) {
_write_other_names($id, $l->{label}, $l->{'iso639'}, $OUT);
}
}
sub _write_other_names {
my ($id, $name, $lang, $OUT) = @_;
if ($name) {
print $OUT $id;
print $OUT "\t";
print $OUT $name;
print $OUT "\t";
print $OUT $lang;
print $OUT "\n";
}
}
sub write_acronyms {
my ($id, $record, $OUT) = @_;
foreach my $acr (@{$record->{'acronyms'}}) {
print $OUT $id;
print $OUT "\t";
print $OUT $acr;
print $OUT "\n";
}
}
sub write_rels {
my ($id, $record, $OUT) = @_;
print $OUT $id;
print $OUT "\t";
print $OUT ""; # reltype - TODO
print $OUT "\t";
print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id)
print $OUT "\n";
}
sub write_urls {
my ($id, $record, $OUT) = @_;
foreach my $url (@{$record->{'links'}}) {
print $OUT $id;
print $OUT "\t";
print $OUT $url;
print $OUT "\n";
}
}
sub getFirstArrayElem {
my ($arr, $default) = @_;
if (@$arr) {
return @$arr[0];
} else {
return $default;
}
}
1;