dnet-applications/apps/dnet-orgs-database-application/scripts/prepare_import_ror.pl

191 lines
4.1 KiB
Perl
Raw Normal View History

#!/usr/bin/perl
# NOTE: this script MUST NOT BE USED; it will be needed only if grid.ac is deprecated
use File::Path 'make_path';
use Digest::MD5 qw(md5_hex);
use JSON::Parse 'json_file_to_perl';
use Data::Dumper;
use strict;
use utf8;
binmode(STDOUT, ":utf8");

# THE LATEST VERSION OF ror.json IS AVAILABLE AT https://figshare.com/collections/ROR_Data/4596503
my $inputFile = '../../../../data/ror.json';
my $outputDir = '../../../../data/ror_tables';

# Create the output directory when it does not exist yet.
unless (-d $outputDir) {
    make_path($outputDir) or die "Failed to create path: $outputDir";
}

my $data = json_file_to_perl($inputFile);

# The main loop below dereferences $data as an array of records, so fail
# with a readable message if the file holds anything else.
ref($data) eq 'ARRAY'
    or die "Unexpected content in $inputFile: expected a JSON array of records";

# Opens "$outputDir/<name>.tsv" for writing with the :utf8 layer and returns
# the filehandle. Dies, reporting the path and the OS error, on failure.
my $open_tsv = sub {
    my ($name) = @_;
    my $path = "$outputDir/$name.tsv";
    open(my $fh, '>:utf8', $path)
        or die("Can't open an output file '$path': $!");
    return $fh;
};

my $OUT_ORGS        = $open_tsv->('organizations');
my $OUT_OTHER_IDS   = $open_tsv->('other_ids');
my $OUT_OTHER_NAMES = $open_tsv->('other_names');
my $OUT_ACRONYMS    = $open_tsv->('acronyms');
my $OUT_RELS        = $open_tsv->('relationships');
my $OUT_URLS        = $open_tsv->('urls');

foreach my $record (@$data) {
    # Temporary identifier derived from the ROR id of the record.
    my $id = 'tmp::' . md5_hex($record->{'id'});

    write_orgs($id, $record, $OUT_ORGS);
    write_other_ids($id, $record, $OUT_OTHER_IDS);
    write_other_names($id, $record, $OUT_OTHER_NAMES);
    write_acronyms($id, $record, $OUT_ACRONYMS);
    # write_rels($id, $record, $OUT_RELS);
    write_urls($id, $record, $OUT_URLS);
}

# Buffered write errors only surface when the handle is closed, so check it.
foreach my $fh ($OUT_ORGS, $OUT_OTHER_IDS, $OUT_OTHER_NAMES, $OUT_ACRONYMS, $OUT_RELS, $OUT_URLS) {
    close($fh) or die("Can't close an output file: $!");
}

print "\nDone.\n\n";
# Writes one tab-separated row for an organization, terminated by a newline.
#
# Parameters:
#   $id     - value written in the first column
#   $record - hash ref of one record; reads 'name', 'types' and
#             'country' -> 'country_code'
#   $OUT    - filehandle the row is printed to
#
# Columns, in order: id, name, type (first entry of 'types', or 'UNKNOWN'
# when the list is empty), lat, lng, city, country code, created_by,
# modified_by.
sub write_orgs {
    my ($id, $record, $OUT) = @_;

    my $type = getFirstArrayElem($record->{'types'}, 'UNKNOWN');

    my @columns = (
        $id,
        $record->{'name'},
        $type,
        0,             # lat  - TODO MISSING
        0,             # lng  - TODO MISSING
        "",            # city - TODO MISSING
        $record->{'country'}->{'country_code'},
        "import:ror",  # created_by
        "import:ror",  # modified_by
    );

    # An undefined field becomes an empty column, exactly as print() would emit it.
    print {$OUT} join("\t", map { defined $_ ? $_ : "" } @columns), "\n";
}
# Writes the identifiers of a record, one row per identifier, through
# _write_other_ids().
#
# Parameters:
#   $id     - value written in the first column of every row
#   $record - hash ref of one record; reads 'id' and 'external_ids'
#   $OUT    - filehandle the rows are printed to
#
# The record's own 'id' is written first with type 'ror'. Then, for every
# key of 'external_ids', the value stored under 'all' is written with that
# key as the type; 'all' may be a single value or an array ref of values.
sub write_other_ids {
    my ($id, $record, $OUT) = @_;

    _write_other_ids($id, $record->{'id'}, 'ror', $OUT);

    my $external_ids = $record->{'external_ids'} || {};

    # Sorted keys keep the row order stable between runs; plain hash order
    # is randomized by perl.
    foreach my $type (sort keys %$external_ids) {
        my $all    = $external_ids->{$type}->{'all'};
        my @values = ref($all) eq 'ARRAY' ? @$all : ($all);

        foreach my $other (@values) {
            _write_other_ids($id, $other, $type, $OUT);
        }
    }
}
# Prints one "id <TAB> identifier <TAB> type" row followed by a newline.
#
# Parameters:
#   $id    - value written in the first column
#   $other - identifier value; nothing is printed when it is undefined or empty
#   $type  - identifier type written in the third column
#   $OUT   - filehandle the row is printed to
sub _write_other_ids {
    my ($id, $other, $type, $OUT) = @_;

    # Test for "defined and non-empty" rather than truthiness, so that the
    # identifier "0" is not silently dropped.
    return unless (defined($other) && length($other));

    print {$OUT} join("\t", $id, $other, $type), "\n";
}
# Writes every name of a record through _write_other_names(): the main
# 'name' with language 'en', each entry of 'aliases' with language 'UNKNOWN',
# and each entry of 'labels' with its own 'label' text and 'iso639' value.
#
# Parameters:
#   $id     - value written in the first column of every row
#   $record - hash ref of one record
#   $OUT    - filehandle the rows are printed to
sub write_other_names {
    my ($id, $record, $OUT) = @_;

    # main name
    _write_other_names($id, $record->{'name'}, 'en', $OUT);

    # aliases carry no language information
    for my $alias_name (@{ $record->{'aliases'} }) {
        _write_other_names($id, $alias_name, 'UNKNOWN', $OUT);
    }

    # labels carry their own language value
    for my $entry (@{ $record->{'labels'} }) {
        _write_other_names($id, $entry->{'label'}, $entry->{'iso639'}, $OUT);
    }
}
# Prints one "id <TAB> name <TAB> language" row followed by a newline.
#
# Parameters:
#   $id   - value written in the first column
#   $name - name to write; nothing is printed when it is undefined or empty
#   $lang - language value for the third column; an undefined value is
#           written as an empty column
#   $OUT  - filehandle the row is printed to
sub _write_other_names {
    my ($id, $name, $lang, $OUT) = @_;

    # Test for "defined and non-empty" rather than truthiness, so that a
    # name consisting of "0" is not silently dropped.
    return unless (defined($name) && length($name));

    print {$OUT} join("\t", $id, $name, defined($lang) ? $lang : ""), "\n";
}
# Prints one "id <TAB> acronym" row, newline-terminated, for every entry of
# the record's 'acronyms' list.
#
# Parameters:
#   $id     - value written in the first column of every row
#   $record - hash ref of one record; reads 'acronyms'
#   $OUT    - filehandle the rows are printed to
sub write_acronyms {
    my ($id, $record, $OUT) = @_;

    for my $acronym (@{ $record->{'acronyms'} }) {
        print {$OUT} $id . "\t" . $acronym . "\n";
    }
}
# Placeholder for exporting relationships between organizations: it takes
# the same arguments as the other writers but currently prints nothing.
# The intended row layout is kept below as commented-out code.
#
# Parameters:
#   $id     - identifier of the organization (unused for now)
#   $record - hash ref of one record (unused for now)
#   $OUT    - filehandle the rows would be printed to (unused for now)
sub write_rels {
    my ($id, $record, $OUT) = @_;

    # print $OUT $id;
    # print $OUT "\t";
    # print $OUT ""; # reltype - TODO
    # print $OUT "\t";
    # print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id)
    # print $OUT "\n";

    return;
}
# Prints one "id <TAB> url" row, newline-terminated, for every entry of the
# record's 'links' list.
#
# Parameters:
#   $id     - value written in the first column of every row
#   $record - hash ref of one record; reads 'links'
#   $OUT    - filehandle the rows are printed to
sub write_urls {
    my ($id, $record, $OUT) = @_;

    for my $link (@{ $record->{'links'} }) {
        print {$OUT} $id . "\t" . $link . "\n";
    }
}
# Returns the first element of an array, or a fallback when there is none.
#
# Parameters:
#   $arr     - array ref to read from; may be undefined
#   $default - value returned when $arr is not an array ref or is empty
#
# Returns: $arr->[0] when $arr is a non-empty array ref, $default otherwise.
sub getFirstArrayElem {
    my ($arr, $default) = @_;

    # Guard against a missing list: dereferencing undef dies under strict refs.
    return $default unless (ref($arr) eq 'ARRAY' && @$arr);

    # Element access; "@$arr[0]" would be a one-element slice.
    return $arr->[0];
}
1;