added datasets

This commit is contained in:
Andrea Mannocci 2021-07-22 11:03:05 +02:00
parent c052601c90
commit c6d01322c3
13 changed files with 40897 additions and 268 deletions

2
.gitignore vendored
View File

@@ -76,7 +76,7 @@ target/
.ipynb_checkpoints/
# exclude data from source control by default
-/data/
+# /data/
# Mac OS-specific storage files
.DS_Store

0
data/external/.gitkeep vendored Normal file
View File

0
data/interim/.gitkeep Normal file
View File

0
data/processed/.gitkeep Normal file
View File

0
data/raw/.gitkeep Normal file
View File

File diff suppressed because it is too large Load Diff

20203
data/raw/export_roar_CSV.csv Normal file

File diff suppressed because one or more lines are too long

5708
data/raw/openDoar.tsv Normal file

File diff suppressed because it is too large Load Diff

2708
data/raw/re3data.tsv Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

32
notebooks/Untitled.ipynb Normal file
View File

@@ -0,0 +1,32 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because one or more lines are too long

View File

@@ -24,27 +24,6 @@ def main(input_filepath, output_filepath):
logger = logging.getLogger(__name__)
logger.info('making final data set from raw data')
with open(os.path.join(input_filepath, 'OpenAIRE_DS_re3data_opendoar.json'), mode='r') as f:
with open(os.path.join(output_filepath, 're3data_opendoar.csv'), mode='w') as csvfile:
csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
csv_writer.writerow(['id', 'url', 'official_name', 'english_name', 'description', 'latitude', 'longitude', 'subjects'])
for line in f:
repo = json.loads(line)
identifier = repo['id']
official_name = repo['officialname']['value']
url = get_value_or_none(repo, 'websiteurl')
english_name = get_value_or_none(repo, 'englishname')
description = get_value_or_none(repo, 'description')
latitude = get_value_or_none(repo, 'latitude')
longitude = get_value_or_none(repo, 'longitude')
subjects = []
for s in repo['subjects']:
subjects.append(s['value'])
csv_writer.writerow([identifier, url, official_name, english_name, description, latitude, longitude, subjects])
if __name__ == '__main__':
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'