From 5f18425c66d176af1a095bd3fae7be45432b44db Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 15 Jul 2020 15:52:58 +0200 Subject: [PATCH] script to import organizations from ROR --- .../resources/scripts/prepare_import_ror.pl | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl diff --git a/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl b/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl new file mode 100644 index 00000000..9c86f567 --- /dev/null +++ b/apps/dnet-orgs-database-application/src/main/resources/scripts/prepare_import_ror.pl @@ -0,0 +1,191 @@ +#!/usr/bin/perl + +use File::Path 'make_path'; +use Digest::MD5 qw(md5_hex); +use JSON::Parse 'json_file_to_perl'; +use Data::Dumper; +use strict; +use utf8; + +binmode(STDOUT, ":utf8"); + +# THE LATEST VERSION OF row.json IS AVAILABLE AT https://figshare.com/collections/ROR_Data/4596503 + +my $inputFile = '../../../../data/ror.json'; +my $outputDir = '../../../../data/ror_tables'; + +make_path $outputDir or die "Failed to create path: $outputDir" unless (-d $outputDir); + +my $data = json_file_to_perl($inputFile); + +open(my $OUT_ORGS , ">$outputDir/organizations.tsv") or die("Can't open an output file"); +open(my $OUT_OTHER_IDS , ">$outputDir/other_ids.tsv") or die("Can't open an output file"); +open(my $OUT_OTHER_NAMES , ">$outputDir/other_names.tsv") or die("Can't open an output file"); +open(my $OUT_ACRONYMS , ">$outputDir/acronyms.tsv") or die("Can't open an output file"); +open(my $OUT_RELS , ">$outputDir/relationships.tsv") or die("Can't open an output file"); +open(my $OUT_URLS , ">$outputDir/urls.tsv") or die("Can't open an output file"); + +binmode($OUT_ORGS, ":utf8"); +binmode($OUT_OTHER_IDS, ":utf8"); +binmode($OUT_OTHER_NAMES, ":utf8"); +binmode($OUT_ACRONYMS, ":utf8"); +binmode($OUT_RELS, ":utf8"); +binmode($OUT_URLS, ":utf8"); + +foreach my $record (@$data) { + if ($record->{'id'}) { + my $id = 'tmp::' . md5_hex($record->{'id'}); + +# print Dumper $record; +# die; + + write_orgs($id, $record, $OUT_ORGS); + write_other_ids($id, $record, $OUT_OTHER_IDS); + write_other_names($id, $record, $OUT_OTHER_NAMES); + write_acronyms($id, $record, $OUT_ACRONYMS); + write_rels($id, $record, $OUT_RELS); + write_urls($id, $record, $OUT_URLS); + } +} + +close($OUT_ORGS); +close($OUT_OTHER_IDS); +close($OUT_OTHER_NAMES); +close($OUT_ACRONYMS); +close($OUT_RELS); +close($OUT_URLS); + +print "\nDone.\n\n"; + +sub write_orgs { + my ($id, $record, $OUT) = @_; + + + print $OUT $id; + print $OUT "\t"; + print $OUT $record->{'name'}; + print $OUT "\t"; + print $OUT getFirstArrayElem($record->{'types'}, 'UNKNOWN'); + print $OUT "\t"; + print $OUT ""; # lat - TODO MISSING + print $OUT "\t"; + print $OUT ""; # lng - TODO MISSING + print $OUT "\t"; + print $OUT ""; # city - TODO MISSING + print $OUT "\t"; + print $OUT $record->{'country'}->{'country_code'}; + print $OUT "\t"; + print $OUT "import:ror"; # created_by + print $OUT "\t"; + print $OUT "import:ror"; # modified_by + print $OUT "\n"; +} + +sub write_other_ids { + my ($id, $record, $OUT) = @_; + + _write_other_ids($id, $record->{'id'}, 'ror', $OUT); + + while (my ($type, $v) = each (%{$record->{'external_ids'}})) { + if ($type eq 'GRID') { + _write_other_ids($id, $v->{'all'}, 'grid.ac', $OUT); + } else { + foreach my $other (@{$v->{'all'}}) { + _write_other_ids($id, $other, $type, $OUT); + } + } + } +} + + +sub _write_other_ids { + my ($id, $other, $type, $OUT) = @_; + + if ($other) { + print $OUT $id; + print $OUT "\t"; + print $OUT $other; + print $OUT "\t"; + print $OUT $type; + print $OUT "\n"; + } +} + + + + +sub write_other_names { + my ($id, $record, $OUT) = @_; + + _write_other_names($id, $record->{'name'}, 'en', $OUT); + + foreach my $alias (@{$record->{'aliases'}}) { + _write_other_names($id, $alias, 'UNKNOWN', $OUT); + } + + foreach my $l (@{$record->{'labels'}}) { + _write_other_names($id, $l->{label}, $l->{'iso639'}, $OUT); + } + +} + +sub _write_other_names { + my ($id, $name, $lang, $OUT) = @_; + + if ($name) { + print $OUT $id; + print $OUT "\t"; + print $OUT $name; + print $OUT "\t"; + print $OUT $lang; + print $OUT "\n"; + } +} + +sub write_acronyms { + my ($id, $record, $OUT) = @_; + + foreach my $acr (@{$record->{'acronyms'}}) { + print $OUT $id; + print $OUT "\t"; + print $OUT $acr; + print $OUT "\n"; + } +} + + +sub write_rels { + my ($id, $record, $OUT) = @_; + + print $OUT $id; + print $OUT "\t"; + print $OUT ""; # reltype - TODO + print $OUT "\t"; + print $OUT ""; # id2 - TODO # Example: 'tmp::'||md5(o.grid_id) + print $OUT "\n"; +} + +sub write_urls { + my ($id, $record, $OUT) = @_; + + + foreach my $url (@{$record->{'links'}}) { + print $OUT $id; + print $OUT "\t"; + print $OUT $url; + print $OUT "\n"; + } +} + +sub getFirstArrayElem { + my ($arr, $default) = @_; + + if (@$arr) { + return @$arr[0]; + } else { + return $default; + } +} + + +1;