From f2c5e262665725353a681e4a0ae914eaaa33e000 Mon Sep 17 00:00:00 2001
From: Andrea Mannocci
Date: Fri, 23 Jun 2023 17:02:01 +0200
Subject: [PATCH] using cookiecutter template

---
 Dockerfile                    |  10 ++-
 Makefile                      | 144 ++++++++++++++++++++++++++++++++++
 notebooks/beginners_kit.ipynb |  33 ++------
 requirements.txt              |  12 +++
 setup.py                      |  10 +++
 src/data/make_dataset.py      |  62 +++++++++++++++
 test_environment.py           |  25 ++++++
 7 files changed, 268 insertions(+), 28 deletions(-)
 create mode 100644 Makefile
 create mode 100644 requirements.txt
 create mode 100644 setup.py
 create mode 100644 src/data/make_dataset.py
 create mode 100644 test_environment.py

diff --git a/Dockerfile b/Dockerfile
index e62c411..06142e0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,15 @@
 FROM jupyter/pyspark-notebook:latest
 RUN pip install papermill
 USER jovyan
+
 RUN mkdir /home/jovyan/openaire
 RUN mkdir /home/jovyan/openaire/data
+RUN mkdir /home/jovyan/openaire/data/raw
+
+ADD notebooks /home/jovyan/openaire/notebooks
+ADD src /home/jovyan/openaire/src
+ADD test_environment.py /home/jovyan/openaire/
+ADD setup.py /home/jovyan/openaire/
+ADD Makefile /home/jovyan/openaire
+ADD requirements.txt /home/jovyan/openaire/
 
-ADD notebooks/beginners_kit.ipynb /home/jovyan/openaire/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..6cabae1
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,144 @@
+.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3
+
+#################################################################################
+# GLOBALS                                                                       #
+#################################################################################
+
+PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
+PROFILE = default
+PROJECT_NAME = project_name
+PYTHON_INTERPRETER = python3
+
+ifeq (,$(shell which conda))
+HAS_CONDA=False
+else
+HAS_CONDA=True
+endif
+
+#################################################################################
+# COMMANDS                                                                      #
+#################################################################################
+
+## Install Python Dependencies
+requirements: test_environment
+	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
+	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
+
+## Make Dataset
+data: requirements
+	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw
+
+## Delete all compiled Python files
+clean:
+	find . -type f -name "*.py[co]" -delete
+	find . -type d -name "__pycache__" -delete
+
+## Lint using flake8
+lint:
+	flake8 src
+
+## Upload Data to S3
+sync_data_to_s3:
+ifeq (default,$(PROFILE))
+	aws s3 sync data/ s3://$(BUCKET)/data/
+else
+	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
+endif
+
+## Download Data from S3
+sync_data_from_s3:
+ifeq (default,$(PROFILE))
+	aws s3 sync s3://$(BUCKET)/data/ data/
+else
+	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
+endif
+
+## Set up python interpreter environment
+create_environment:
+ifeq (True,$(HAS_CONDA))
+	@echo ">>> Detected conda, creating conda environment."
+ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
+	conda create --name $(PROJECT_NAME) python=3
+else
+	conda create --name $(PROJECT_NAME) python=2.7
+endif
+	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
+else
+	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
+	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
+export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
+	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
+	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
+endif
+
+## Test python environment is setup correctly
+test_environment:
+	$(PYTHON_INTERPRETER) test_environment.py
+
+#################################################################################
+# PROJECT RULES                                                                 #
+#################################################################################
+
+
+
+#################################################################################
+# Self Documenting Commands                                                     #
+#################################################################################
+
+.DEFAULT_GOAL := help
+
+# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
+# sed script explained:
+# /^##/:
+# 	* save line in hold space
+# 	* purge line
+# 	* Loop:
+# 		* append newline + line to hold space
+# 		* go to next line
+# 		* if line starts with doc comment, strip comment character off and loop
+# 	* remove target prerequisites
+# 	* append hold space (+ newline) to line
+# 	* replace newline plus comments by `---`
+# 	* print line
+# Separate expressions are necessary because labels cannot be delimited by
+# semicolon; see <http://stackoverflow.com/a/11799865/1968>
+.PHONY: help
+help:
+	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
+	@echo
+	@sed -n -e "/^## / { \
+		h; \
+		s/.*//; \
+		:doc" \
+		-e "H; \
+		n; \
+		s/^## //; \
+		t doc" \
+		-e "s/:.*//; \
+		G; \
+		s/\\n## /---/; \
+		s/\\n/ /g; \
+		p; \
+	}" ${MAKEFILE_LIST} \
+	| LC_ALL='C' sort --ignore-case \
+	| awk -F '---' \
+		-v ncol=$$(tput cols) \
+		-v indent=19 \
+		-v col_on="$$(tput setaf 6)" \
+		-v col_off="$$(tput sgr0)" \
+	'{ \
+		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
+		n = split($$2, words, " "); \
+		line_length = ncol - indent; \
+		for (i = 1; i <= n; i++) { \
+			line_length -= length(words[i]) + 1; \
+			if (line_length <= 0) { \
+				line_length = ncol - indent - length(words[i]) - 1; \
+				printf "\n%*s ", -indent, " "; \
+			} \
+			printf "%s ", words[i]; \
+		} \
+		printf "\n"; \
+	}' \
+	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
diff --git a/notebooks/beginners_kit.ipynb b/notebooks/beginners_kit.ipynb
index 13a9ec6..4dadf6d 100644
--- a/notebooks/beginners_kit.ipynb
+++ b/notebooks/beginners_kit.ipynb
@@ -29,31 +29,11 @@
    "source": [
     "This step can take some time depending on your network speed.\n",
     "\n",
-    "Uncomment and run only if you need to download the data the first time: these lines just download the datasets from the deposition on Zenodo containing data for this kit (https://zenodo.org/record/7490192), untar the content and clean up. All the data needed will sit under the `data` folder."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "slideshow": {
-     "slide_type": "notes"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# import os\n",
-    "# base_url = \"https://zenodo.org/record/7490192/files/\"\n",
+    "This project follows the guidelines detailed in https://drivendata.github.io/cookiecutter-data-science/ and uses `make` to perform actions.\n",
     "\n",
+    "If you need to download the data the first time, open a terminal and run `make data` from the `openaire` folder; `make` will take care of all the steps (i.e., downloading the relevant datasets from Zenodo, https://zenodo.org/record/7490192, untar the content and clean up). \n",
     "\n",
-    "# items =[\"communities_infrastructures.tar\",\"dataset.tar\",\"datasource.tar\",\"organization.tar\",\"otherresearchproduct.tar\",\"project.tar\",\"publication.tar\",\"relation.tar\", \"software.tar\"]\n",
-    "\n",
-    "# for item in items: \n",
-    "#     print(f\"Downloading {item}\")\n",
-    "#     os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
-    "#     print(f\"Extracting {item}\")\n",
-    "#     os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
+    "At the end, the data we need will sit under the `data/raw` folder. Data in here should be immutable; let's keep it that way."
    ]
   },
   {
@@ -108,7 +88,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_json('./data/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
+    "df = pd.read_json('../data/raw/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
     "df.head(2)"
    ]
   },
@@ -128,7 +108,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# files = sorted(glob.glob('./data/publication/part-*.txt.gz'))\n",
+    "# files = sorted(glob.glob('../data/raw/publication/part-*.txt.gz'))\n",
     "# publications_df = pd.concat(pd.read_json(f, compression='gzip', lines=True) for f in files)"
    ]
   },
@@ -194,7 +174,7 @@
    },
    "outputs": [],
    "source": [
-    "inputPath = 'data/'\n",
+    "inputPath = '../data/raw/'\n",
     " \n",
     "publications = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')\n",
     "datasets = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')\n",
@@ -222,7 +202,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "publications.createOrReplaceTempView(\"publications\")\n",
     "datasets.createOrReplaceTempView(\"datasets\")\n",
     "softwares.createOrReplaceTempView(\"software\")\n",
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a792caf
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+# local package
+-e .
+
+# external requirements
+click
+coverage
+flake8
+python-dotenv>=0.5.1
+pandas
+igraph
+matplotlib
+plotly
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..4e1b3b8
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,10 @@
+from setuptools import find_packages, setup
+
+setup(
+    name='src',
+    packages=find_packages(),
+    version='0.1.0',
+    description="OpenAIRE Beginner's Kit.",
+    author='Andrea Mannocci, Miriam Baglioni, Sandro La Bruzzo',
+    license='MIT',
+)
diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py
new file mode 100644
index 0000000..1b0efbd
--- /dev/null
+++ b/src/data/make_dataset.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+import os
+from urllib.request import urlretrieve
+from urllib.parse import urlsplit
+import tarfile
+import click
+import logging
+from pathlib import Path
+from dotenv import find_dotenv, load_dotenv
+
+logger = logging.getLogger(__name__)
+
+openaire_files = ["https://zenodo.org/record/7490192/files/communities_infrastructures.tar",
+                  "https://zenodo.org/record/7490192/files/dataset.tar",
+                  "https://zenodo.org/record/7490192/files/datasource.tar",
+                  "https://zenodo.org/record/7490192/files/organization.tar",
+                  "https://zenodo.org/record/7490192/files/otherresearchproduct.tar",
+                  "https://zenodo.org/record/7490192/files/project.tar",
+                  "https://zenodo.org/record/7490192/files/publication.tar",
+                  "https://zenodo.org/record/7490192/files/relation.tar",
+                  "https://zenodo.org/record/7490192/files/software.tar"]
+
+
+def download_tar(url, path):
+    tar_name = urlsplit(url).path.split('/')[-1]         # publication.tar
+    tar_path = os.path.join(path, tar_name)              # data/raw/publication.tar
+    untarred_folder = tar_name.split('.')[0]             # publication
+    untarred_path = os.path.join(path, untarred_folder)  # data/raw/publication
+    if not os.path.exists(untarred_path):
+        if not os.path.exists(tar_path):
+            logger.info('downloading %s' % url)
+            urlretrieve(url, tar_path)
+
+        logger.info('untar %s' % tar_name)
+        with tarfile.open(tar_path, "r") as tar:
+            tar.extractall(path)
+
+        logger.info('cleaning')
+        os.remove(tar_path)
+
+
+@click.command()
+@click.argument('output_filepath', type=click.Path(exists=True))
+def main(output_filepath):
+    """ Downloads data into /data/raw
+    """
+    for tar in openaire_files:
+        download_tar(tar, output_filepath)
+
+
+if __name__ == '__main__':
+    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    logging.basicConfig(level=logging.INFO, format=log_fmt)
+
+    # not used in this stub but often useful for finding various files
+    project_dir = Path(__file__).resolve().parents[2]
+
+    # find .env automagically by walking up directories until it's found, then
+    # load up the .env entries as environment variables
+    load_dotenv(find_dotenv())
+
+    main()
diff --git a/test_environment.py b/test_environment.py
new file mode 100644
index 0000000..d0ac4a7
--- /dev/null
+++ b/test_environment.py
@@ -0,0 +1,25 @@
+import sys
+
+REQUIRED_PYTHON = "python3"
+
+
+def main():
+    system_major = sys.version_info.major
+    if REQUIRED_PYTHON == "python":
+        required_major = 2
+    elif REQUIRED_PYTHON == "python3":
+        required_major = 3
+    else:
+        raise ValueError("Unrecognized python interpreter: {}".format(
+            REQUIRED_PYTHON))
+
+    if system_major != required_major:
+        raise TypeError(
+            "This project requires Python {}. Found: Python {}".format(
+                required_major, sys.version))
+    else:
+        print(">>> Development environment passes all tests!")
+
+
+if __name__ == '__main__':
+    main()