added opendoar dump; refactored dump scripts

Andrea Mannocci 2023-02-01 12:44:32 +01:00
parent 85d2dc6cb7
commit a055b4247e
6 changed files with 6309 additions and 2113 deletions

.gitignore (3 additions)

@@ -87,3 +87,6 @@ target/
 # Mypy cache
 .mypy_cache/
+# Configurations
+config.py
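
The newly ignored config.py holds the credentials that both dump scripts import. A minimal sketch of what it is expected to expose, with placeholder values only (the attribute names are taken from the scripts below; the values are not part of the commit):

# config.py -- kept out of version control (see .gitignore above).
# Placeholder values; substitute real credentials locally.
fairsharing_username = 'user@example.org'          # FAIRsharing account login
fairsharing_password = 'changeme'                  # FAIRsharing account password
opendoar_apikey = 'your-sherpa-opendoar-api-key'   # Sherpa/OpenDOAR v2 API key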

File diff suppressed because one or more lines are too long (3 files).

FAIRsharing dump script

@@ -1,9 +1,13 @@
+import config
 import requests
 import json
+import time
+DATE = time.strftime('%Y-%m-%d')
 url = "https://api.fairsharing.org/users/sign_in"
-payload="{\"user\": {\"login\":\"andrea.mannocci@isti.cnr.it\",\"password\":\"\"} }"
+payload="{\"user\": {\"login\":\"%s\",\"password\":\"%s\"} }" % (config.fairsharing_username, config.fairsharing_password)
 headers = {
   'Accept': 'application/json',
   'Content-Type': 'application/json'
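
For context, the unchanged lines between the two hunks perform the actual sign-in that yields `token`, which the second hunk then sends as a bearer token. A hedged recap of that step, assuming the FAIRsharing sign-in response carries the token in a 'jwt' field:

# Recap of the sign-in step (unchanged by this commit, shown for context only).
# The response field name 'jwt' is an assumption, not confirmed by the diff.
response = requests.request("POST", url, headers=headers, data=payload)
token = response.json()['jwt']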
@@ -22,13 +26,19 @@ headers = {
   'Content-Type': 'application/json',
   'Authorization': 'Bearer ' + token,
 }
-print(headers)
+# print(headers)
-with open('./fairsharing.json', 'w') as file_out:
-    for page in range(1, 5):
-        url = 'https://api.fairsharing.org/databases/?page[number]=%s&page[size]=500' % page
+with open('../../data/raw/fairsharing_dump_%s.json' % DATE, 'w') as file_out:
+    page = 1
+    size = 500
+    while(True):
+        url = 'https://api.fairsharing.org/databases/?page[number]=%s&page[size]=%s' % (page,size)
         print(url)
         response = requests.request("GET", url, headers=headers)
         file_out.writelines('\n'.join([json.dumps(record) for record in response.json()['data']]))
         file_out.write('\n')
+        if len(response.json()['data']) < size:
+            break
+        page += 1
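
After this refactor the script no longer assumes a fixed four pages: it keeps requesting pages of 500 records until a short page signals the end, and writes one JSON object per line. A minimal sketch of reading such a dump back, assuming a dump written today under the path pattern used above (not part of the commit):

import json
import time

DATE = time.strftime('%Y-%m-%d')

# Read the newline-delimited JSON dump written by the script above.
records = []
with open('../../data/raw/fairsharing_dump_%s.json' % DATE) as file_in:
    for line in file_in:
        records.append(json.loads(line))

print('%s FAIRsharing records loaded' % len(records))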

src/data/dump_opendoar.py (new executable file, 21 additions)

@@ -0,0 +1,21 @@
+import config
+import requests
+import json
+import time
+DATE = time.strftime('%Y-%m-%d')
+with open('../../data/raw/opendoar_dump_%s.json' % DATE, 'w') as file_out:
+    size = 100
+    offset = 0
+    while(True):
+        url = 'https://v2.sherpa.ac.uk/cgi/retrieve/cgi/retrieve?item-type=repository&api-key=%s&format=Json&limit=%s&offset=%s' % (config.opendoar_apikey, size, offset)
+        print(url)
+        response = requests.request("GET", url)
+        file_out.writelines('\n'.join([json.dumps(record) for record in response.json()['items']]))
+        file_out.write('\n')
+        if len(response.json()['items']) < size:
+            break
+        offset += size
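
Both dump scripts now follow the same pattern: page through the API until a page comes back short, writing one JSON object per line. An illustrative generalization of that pattern into a reusable generator (not part of the commit; fetch_all, url_template and item_key are hypothetical names):

import requests

def fetch_all(url_template, item_key, size=100):
    # Hypothetical helper, not part of this commit: iterate over an
    # offset/limit-paginated API until a page comes back short.
    offset = 0
    while True:
        response = requests.get(url_template % (size, offset))
        items = response.json()[item_key]
        for item in items:
            yield item
        if len(items) < size:   # a short page means the last one was reached
            return
        offset += size

For example, calling fetch_all with the Sherpa/OpenDOAR retrieve URL (limit and offset left as %s placeholders) and item_key='items' would reproduce the loop above, while the FAIRsharing case would additionally need the bearer-token header.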