using cookiecutter template

Andrea Mannocci 2023-06-23 17:02:01 +02:00
parent d70036c5fe
commit f2c5e26266
7 changed files with 268 additions and 28 deletions

Dockerfile

@@ -2,7 +2,15 @@ FROM jupyter/pyspark-notebook:latest
RUN pip install papermill
USER jovyan
RUN mkdir /home/jovyan/openaire
RUN mkdir /home/jovyan/openaire/data
RUN mkdir /home/jovyan/openaire/data/raw
ADD notebooks /home/jovyan/openaire/notebooks
ADD src /home/jovyan/openaire/src
ADD test_environment.py /home/jovyan/openaire/
ADD setup.py /home/jovyan/openaire/
ADD Makefile /home/jovyan/openaire
ADD requirements.txt /home/jovyan/openaire/
ADD notebooks/beginners_kit.ipynb /home/jovyan/openaire/
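For reference, a minimal sketch of how this image could be built and started locally; the `openaire-kit` tag is an illustrative assumption, and 8888 is the default Jupyter port of the jupyter/pyspark-notebook base image:

docker build -t openaire-kit .              # hypothetical tag, run from the folder containing this Dockerfile
docker run --rm -p 8888:8888 openaire-kit   # serves the notebook environment on localhost:8888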

Makefile (new file, +144)

@@ -0,0 +1,144 @@
.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3
#################################################################################
# GLOBALS #
#################################################################################
PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
PROFILE = default
PROJECT_NAME = project_name
PYTHON_INTERPRETER = python3
ifeq (,$(shell which conda))
HAS_CONDA=False
else
HAS_CONDA=True
endif
#################################################################################
# COMMANDS #
#################################################################################
## Install Python Dependencies
requirements: test_environment
	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt

## Make Dataset
data: requirements
	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw

## Delete all compiled Python files
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete

## Lint using flake8
lint:
	flake8 src

## Upload Data to S3
sync_data_to_s3:
ifeq (default,$(PROFILE))
	aws s3 sync data/ s3://$(BUCKET)/data/
else
	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
endif

## Download Data from S3
sync_data_from_s3:
ifeq (default,$(PROFILE))
	aws s3 sync s3://$(BUCKET)/data/ data/
else
	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
endif

## Set up python interpreter environment
create_environment:
ifeq (True,$(HAS_CONDA))
	@echo ">>> Detected conda, creating conda environment."
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
	conda create --name $(PROJECT_NAME) python=3
else
	conda create --name $(PROJECT_NAME) python=2.7
endif
	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
else
	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
	export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
endif

## Test python environment is setup correctly
test_environment:
	$(PYTHON_INTERPRETER) test_environment.py
#################################################################################
# PROJECT RULES #
#################################################################################
#################################################################################
# Self Documenting Commands #
#################################################################################
.DEFAULT_GOAL := help
# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# * save line in hold space
# * purge line
# * Loop:
# * append newline + line to hold space
# * go to next line
# * if line starts with doc comment, strip comment character off and loop
# * remove target prerequisites
# * append hold space (+ newline) to line
# * replace newline plus comments by `---`
# * print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
.PHONY: help
help:
	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
	@echo
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| LC_ALL='C' sort --ignore-case \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
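Taken together, the targets above give the intended workflow. A minimal usage sketch, assuming the commands are run from the project root (the folder containing this Makefile):

make create_environment   # conda env or virtualenvwrapper env, depending on whether conda is detected
make requirements         # runs test_environment.py, then pip-installs requirements.txt
make data                 # downloads the Zenodo dumps and untars them into data/raw
make help                 # prints the auto-documented list of targets defined above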

notebooks/beginners_kit.ipynb

@@ -29,31 +29,11 @@
"source": [
"This step can take some time depending on your network speed.\n",
"\n",
"Uncomment and run only if you need to download the data the first time: these lines just download the datasets from the deposition on Zenodo containing data for this kit (https://zenodo.org/record/7490192), untar the content and clean up. All the data needed will sit under the `data` folder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "notes"
},
"tags": []
},
"outputs": [],
"source": [
"# import os\n",
"# base_url = \"https://zenodo.org/record/7490192/files/\"\n",
"This project follows the guidelines detailed in https://drivendata.github.io/cookiecutter-data-science/ and uses `make` to perform actions.\n",
"\n",
"If you need to download the data the first time, open a terminal and run `make data` from the `openaire` folder; `make` will take care of all the steps (i.e., downloading the relevant datasets from Zenodo, https://zenodo.org/record/7490192, untar the content and clean up). \n",
"\n",
"# items =[\"communities_infrastructures.tar\",\"dataset.tar\",\"datasource.tar\",\"organization.tar\",\"otherresearchproduct.tar\",\"project.tar\",\"publication.tar\",\"relation.tar\", \"software.tar\"]\n",
"\n",
"# for item in items: \n",
"# print(f\"Downloading {item}\")\n",
"# os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
"# print(f\"Extracting {item}\")\n",
"# os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
"At the end, the data we need will sit under the `data/raw` folder. Data in here should be immutable; let's keep it that way."
]
},
{
@@ -108,7 +88,7 @@
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_json('./data/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
"df = pd.read_json('../data/raw/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
"df.head(2)"
]
},
@@ -128,7 +108,7 @@
"metadata": {},
"outputs": [],
"source": [
"# files = sorted(glob.glob('./data/publication/part-*.txt.gz'))\n",
"# files = sorted(glob.glob('../data/raw/publication/part-*.txt.gz'))\n",
"# publications_df = pd.concat(pd.read_json(f, compression='gzip', lines=True) for f in files)"
]
},
@@ -194,7 +174,7 @@
},
"outputs": [],
"source": [
"inputPath = 'data/'\n",
"inputPath = '../data/raw/'\n",
" \n",
"publications = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')\n",
"datasets = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')\n",
@@ -222,7 +202,6 @@
"metadata": {},
"outputs": [],
"source": [
"\n",
"publications.createOrReplaceTempView(\"publications\")\n",
"datasets.createOrReplaceTempView(\"datasets\")\n",
"softwares.createOrReplaceTempView(\"software\")\n",

requirements.txt (new file, +12)

@@ -0,0 +1,12 @@
# local package
-e .
# external requirements
click
coverage
flake8
python-dotenv>=0.5.1
pandas
igraph
matplotlib
plotly

setup.py (new file, +10)

@@ -0,0 +1,10 @@
from setuptools import find_packages, setup

setup(
    name='src',
    packages=find_packages(),
    version='0.1.0',
    description="OpenAIRE Beginner's Kit.",
    author='Andrea Mannocci, Miriam Baglioni, Sandro La Bruzzo',
    license='MIT',
)
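Note that requirements.txt above pulls this package in with `-e .`, so installing the requirements also installs `src` in editable mode; doing it by hand would amount to:

pip install -e .    # editable install of the local src package, run from the project root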

src/data/make_dataset.py (new file, +62)

@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
import os
from urllib.request import urlretrieve
from urllib.parse import urlsplit
import tarfile
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
logger = logging.getLogger(__name__)
openaire_files = ["https://zenodo.org/record/7490192/files/communities_infrastructures.tar",
"https://zenodo.org/record/7490192/files/dataset.tar",
"https://zenodo.org/record/7490192/files/datasource.tar",
"https://zenodo.org/record/7490192/files/organization.tar",
"https://zenodo.org/record/7490192/files/otherresearchproduct.tar",
"https://zenodo.org/record/7490192/files/project.tar",
"https://zenodo.org/record/7490192/files/publication.tar",
"https://zenodo.org/record/7490192/files/relation.tar",
"https://zenodo.org/record/7490192/files/software.tar"]
def download_tar(url, path):
    tar_name = urlsplit(url).path.split('/')[-1]          # publication.tar
    tar_path = os.path.join(path, tar_name)               # data/raw/publication.tar
    untarred_folder = tar_name.split('.')[0]              # publication
    untarred_path = os.path.join(path, untarred_folder)   # data/raw/publication

    if not os.path.exists(untarred_path):
        if not os.path.exists(tar_path):
            logger.info('downloading %s' % url)
            urlretrieve(url, tar_path)
        logger.info('untar %s' % tar_name)
        with tarfile.open(tar_path, "r") as tar:
            tar.extractall(path)
        logger.info('cleaning')
        os.remove(tar_path)


@click.command()
@click.argument('output_filepath', type=click.Path(exists=True))
def main(output_filepath):
    """ Downloads data into /data/raw
    """
    for tar in openaire_files:
        download_tar(tar, output_filepath)


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
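The `data` target in the Makefile invokes this script; it can also be run directly. A minimal sketch, keeping in mind that `click.Path(exists=True)` requires the output folder to exist beforehand (the Dockerfile creates `data/raw` with `mkdir`):

mkdir -p data/raw                           # the output folder must already exist
python3 src/data/make_dataset.py data/raw   # downloads and untars the Zenodo dumps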

test_environment.py (new file, +25)

@@ -0,0 +1,25 @@
import sys
REQUIRED_PYTHON = "python3"
def main():
    system_major = sys.version_info.major
    if REQUIRED_PYTHON == "python":
        required_major = 2
    elif REQUIRED_PYTHON == "python3":
        required_major = 3
    else:
        raise ValueError("Unrecognized python interpreter: {}".format(
            REQUIRED_PYTHON))

    if system_major != required_major:
        raise TypeError(
            "This project requires Python {}. Found: Python {}".format(
                required_major, sys.version))
    else:
        print(">>> Development environment passes all tests!")


if __name__ == '__main__':
    main()
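This check is wired into the Makefile as the `test_environment` prerequisite of the `requirements` target; it can also be run on its own:

python3 test_environment.py   # prints a success message, or raises if the Python major version is wrong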