added script to regenerate hostedBy Map following instruction defined on ticket #7539

updated hosted By Map
This commit is contained in:
Sandro La Bruzzo 2022-02-15 11:05:27 +01:00
parent 90e197a563
commit 3aa2020b24
2 changed files with 1103 additions and 25 deletions

View File

@ -0,0 +1,63 @@
from urllib.request import urlopen
import json
def retrieve_datacite_clients(base_url):
datacite_clients = {}
while base_url is not None:
with urlopen(base_url) as response:
print(f"requesting {base_url}")
response_content = response.read()
data = json.loads(response_content)
if 'data' in data and len(data['data'])>0:
for item in data['data']:
datacite_clients[item['id'].lower()]= item['attributes']['re3data'].lower().replace("https://doi.org/","")
base_url = data['links']['next']
else:
base_url = None
return datacite_clients
def retrieve_r3data(start_url):
r3data_clients = {}
page_number = 1
base_url = start_url
while base_url is not None:
with urlopen(base_url) as response:
print(f"requesting {base_url}")
response_content = response.read()
data = json.loads(response_content)
if 'data' in data and len(data['data'])>0:
for item in data['data']:
r3data_clients[item['id'].lower()]= dict(
openaire_id= "re3data_____::"+item['attributes']['re3dataId'].lower(),
official_name=item['attributes']['repositoryName']
)
page_number +=1
base_url = f"{start_url}&page[number]={page_number}"
else:
base_url = None
return r3data_clients
base_url ="https://api.datacite.org/clients?query=re3data_id:*&page[size]=250"
dc = retrieve_datacite_clients(base_url)
r3 = retrieve_r3data("https://api.datacite.org/re3data?page[size]=250")
result = {}
for item in dc:
res = dc[item].lower()
if res not in r3:
print(f"missing {res} for {item} in dictionary")
else:
result[item.upper()]= dict(openaire_id=r3[res]["openaire_id"],datacite_name=r3[res]["official_name"], official_name=r3[res]["official_name"] )
with open('hostedBy_map.json', 'w', encoding='utf8') as json_file:
json.dump(result, json_file, ensure_ascii=False, indent=1)