script to import organizations from ROR
This commit is contained in:
parent
25e57d5541
commit
5f18425c66
|
@ -0,0 +1,191 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use File::Path 'make_path';
|
||||
use Digest::MD5 qw(md5_hex);
|
||||
use JSON::Parse 'json_file_to_perl';
|
||||
use Data::Dumper;
|
||||
use strict;
|
||||
use utf8;
|
||||
|
||||
binmode(STDOUT, ":utf8");
|
||||
|
||||
# THE LATEST VERSION OF row.json IS AVAILABLE AT https://figshare.com/collections/ROR_Data/4596503
|
||||
|
||||
my $inputFile = '../../../../data/ror.json';
|
||||
my $outputDir = '../../../../data/ror_tables';
|
||||
|
||||
make_path $outputDir or die "Failed to create path: $outputDir" unless (-d $outputDir);
|
||||
|
||||
my $data = json_file_to_perl($inputFile);
|
||||
|
||||
open(my $OUT_ORGS , ">$outputDir/organizations.tsv") or die("Can't open an output file");
|
||||
open(my $OUT_OTHER_IDS , ">$outputDir/other_ids.tsv") or die("Can't open an output file");
|
||||
open(my $OUT_OTHER_NAMES , ">$outputDir/other_names.tsv") or die("Can't open an output file");
|
||||
open(my $OUT_ACRONYMS , ">$outputDir/acronyms.tsv") or die("Can't open an output file");
|
||||
open(my $OUT_RELS , ">$outputDir/relationships.tsv") or die("Can't open an output file");
|
||||
open(my $OUT_URLS , ">$outputDir/urls.tsv") or die("Can't open an output file");
|
||||
|
||||
binmode($OUT_ORGS, ":utf8");
|
||||
binmode($OUT_OTHER_IDS, ":utf8");
|
||||
binmode($OUT_OTHER_NAMES, ":utf8");
|
||||
binmode($OUT_ACRONYMS, ":utf8");
|
||||
binmode($OUT_RELS, ":utf8");
|
||||
binmode($OUT_URLS, ":utf8");
|
||||
|
||||
foreach my $record (@$data) {
|
||||
if ($record->{'id'}) {
|
||||
my $id = 'tmp::' . md5_hex($record->{'id'});
|
||||
|
||||
# print Dumper $record;
|
||||
# die;
|
||||
|
||||
write_orgs($id, $record, $OUT_ORGS);
|
||||
write_other_ids($id, $record, $OUT_OTHER_IDS);
|
||||
write_other_names($id, $record, $OUT_OTHER_NAMES);
|
||||
write_acronyms($id, $record, $OUT_ACRONYMS);
|
||||
write_rels($id, $record, $OUT_RELS);
|
||||
write_urls($id, $record, $OUT_URLS);
|
||||
}
|
||||
}
|
||||
|
||||
close($OUT_ORGS);
|
||||
close($OUT_OTHER_IDS);
|
||||
close($OUT_OTHER_NAMES);
|
||||
close($OUT_ACRONYMS);
|
||||
close($OUT_RELS);
|
||||
close($OUT_URLS);
|
||||
|
||||
print "\nDone.\n\n";
|
||||
|
||||
sub write_orgs {
|
||||
my ($id, $record, $OUT) = @_;
|
||||
|
||||
|
||||
print $OUT $id;
|
||||
print $OUT "\t";
|
||||
print $OUT $record->{'name'};
|
||||
print $OUT "\t";
|
||||
print $OUT getFirstArrayElem($record->{'types'}, 'UNKNOWN');
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # lat - TODO MISSING
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # lng - TODO MISSING
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # city - TODO MISSING
|
||||
print $OUT "\t";
|
||||
print $OUT $record->{'country'}->{'country_code'};
|
||||
print $OUT "\t";
|
||||
print $OUT "import:ror"; # created_by
|
||||
print $OUT "\t";
|
||||
print $OUT "import:ror"; # modified_by
|
||||
print $OUT "\n";
|
||||
}
|
||||
|
||||
sub write_other_ids {
|
||||
my ($id, $record, $OUT) = @_;
|
||||
|
||||
_write_other_ids($id, $record->{'id'}, 'ror', $OUT);
|
||||
|
||||
while (my ($type, $v) = each (%{$record->{'external_ids'}})) {
|
||||
if ($type eq 'GRID') {
|
||||
_write_other_ids($id, $v->{'all'}, 'grid.ac', $OUT);
|
||||
} else {
|
||||
foreach my $other (@{$v->{'all'}}) {
|
||||
_write_other_ids($id, $other, $type, $OUT);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
sub _write_other_ids {
|
||||
my ($id, $other, $type, $OUT) = @_;
|
||||
|
||||
if ($other) {
|
||||
print $OUT $id;
|
||||
print $OUT "\t";
|
||||
print $OUT $other;
|
||||
print $OUT "\t";
|
||||
print $OUT $type;
|
||||
print $OUT "\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
sub write_other_names {
|
||||
my ($id, $record, $OUT) = @_;
|
||||
|
||||
_write_other_names($id, $record->{'name'}, 'en', $OUT);
|
||||
|
||||
foreach my $alias (@{$record->{'aliases'}}) {
|
||||
_write_other_names($id, $alias, 'UNKNOWN', $OUT);
|
||||
}
|
||||
|
||||
foreach my $l (@{$record->{'labels'}}) {
|
||||
_write_other_names($id, $l->{label}, $l->{'iso639'}, $OUT);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
sub _write_other_names {
|
||||
my ($id, $name, $lang, $OUT) = @_;
|
||||
|
||||
if ($name) {
|
||||
print $OUT $id;
|
||||
print $OUT "\t";
|
||||
print $OUT $name;
|
||||
print $OUT "\t";
|
||||
print $OUT $lang;
|
||||
print $OUT "\n";
|
||||
}
|
||||
}
|
||||
|
||||
sub write_acronyms {
|
||||
my ($id, $record, $OUT) = @_;
|
||||
|
||||
foreach my $acr (@{$record->{'acronyms'}}) {
|
||||
print $OUT $id;
|
||||
print $OUT "\t";
|
||||
print $OUT $acr;
|
||||
print $OUT "\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
sub write_rels {
|
||||
my ($id, $record, $OUT) = @_;
|
||||
|
||||
print $OUT $id;
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # reltype - TODO
|
||||
print $OUT "\t";
|
||||
print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id)
|
||||
print $OUT "\n";
|
||||
}
|
||||
|
||||
sub write_urls {
|
||||
my ($id, $record, $OUT) = @_;
|
||||
|
||||
|
||||
foreach my $url (@{$record->{'links'}}) {
|
||||
print $OUT $id;
|
||||
print $OUT "\t";
|
||||
print $OUT $url;
|
||||
print $OUT "\n";
|
||||
}
|
||||
}
|
||||
|
||||
sub getFirstArrayElem {
|
||||
my ($arr, $default) = @_;
|
||||
|
||||
if (@$arr) {
|
||||
return @$arr[0];
|
||||
} else {
|
||||
return $default;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
1;
|
Loading…
Reference in New Issue