added opendoar dump; refactored dump scripts

This commit is contained in:
Andrea Mannocci 2023-02-01 12:44:32 +01:00
parent 85d2dc6cb7
commit a055b4247e
6 changed files with 6309 additions and 2113 deletions

3
.gitignore vendored
View File

@ -87,3 +87,6 @@ target/
# Mypy cache
.mypy_cache/
# Configurations
config.py

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,13 @@
import config
import requests
import json
import time
DATE = time.strftime('%Y-%m-%d')
url = "https://api.fairsharing.org/users/sign_in"
payload="{\"user\": {\"login\":\"andrea.mannocci@isti.cnr.it\",\"password\":\"\"} }"
payload="{\"user\": {\"login\":\"%s\",\"password\":\"%s\"} }" % (config.fairsharing_username, config.fairsharing_password)
headers = {
'Accept': 'application/json',
'Content-Type': 'application/json'
@ -22,13 +26,19 @@ headers = {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + token,
}
print(headers)
# print(headers)
with open('./fairsharing.json', 'w') as file_out:
for page in range(1, 5):
url = 'https://api.fairsharing.org/databases/?page[number]=%s&page[size]=500' % page
with open('../../data/raw/fairsharing_dump_%s.json' % DATE, 'w') as file_out:
page = 1
size = 500
while(True):
url = 'https://api.fairsharing.org/databases/?page[number]=%s&page[size]=%s' % (page,size)
print(url)
response = requests.request("GET", url, headers=headers)
file_out.writelines('\n'.join([json.dumps(record) for record in response.json()['data']]))
file_out.write('\n')
file_out.write('\n')
if len(response.json()['data']) < size:
break
page += 1

21
src/data/dump_opendoar.py Executable file
View File

@ -0,0 +1,21 @@
import config
import requests
import json
import time
# Date stamp (local time, YYYY-MM-DD) embedded in the dump file name.
DATE = time.strftime('%Y-%m-%d')

# Dump every OpenDOAR repository record to a line-delimited JSON file
# (one JSON object per line), paging through the Sherpa v2 retrieve API
# with limit/offset until a short (or empty) page signals the end.
with open('../../data/raw/opendoar_dump_%s.json' % DATE, 'w') as file_out:
    url = 'https://v2.sherpa.ac.uk/cgi/retrieve/cgi/retrieve'
    size = 100
    offset = 0
    while True:
        params = {
            'item-type': 'repository',
            'api-key': config.opendoar_apikey,
            'format': 'Json',
            'limit': size,
            'offset': offset,
        }
        # Log progress without the query string so the API key is never
        # echoed to stdout or captured in job logs.
        print('%s [limit=%s, offset=%s]' % (url, size, offset))
        # A timeout keeps an unresponsive server from hanging the dump forever.
        response = requests.get(url, params=params, timeout=60)
        # Fail loudly on HTTP errors instead of hitting an opaque KeyError /
        # JSON decode error further down.
        response.raise_for_status()
        # Parse the body once and reuse it for both writing and the stop test.
        items = response.json()['items']
        if items:
            # Skip the write for an empty final page; otherwise a blank line
            # would be appended to the line-delimited output.
            file_out.write('\n'.join(json.dumps(record) for record in items))
            file_out.write('\n')
        if len(items) < size:
            # A page shorter than the requested limit is the last one.
            break
        offset += size