using cookiecutter template
parent d70036c5fe
commit f2c5e26266
Dockerfile
@@ -2,7 +2,15 @@ FROM jupyter/pyspark-notebook:latest
 
 RUN pip install papermill
 
 USER jovyan
 RUN mkdir /home/jovyan/openaire
 RUN mkdir /home/jovyan/openaire/data
-ADD notebooks/beginners_kit.ipynb /home/jovyan/openaire/
+RUN mkdir /home/jovyan/openaire/data/raw
+
+ADD notebooks /home/jovyan/openaire/notebooks
+ADD src /home/jovyan/openaire/src
+ADD test_environment.py /home/jovyan/openaire/
+ADD setup.py /home/jovyan/openaire/
+ADD Makefile /home/jovyan/openaire
+ADD requirements.txt /home/jovyan/openaire/
+
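Since the image installs papermill, a minimal sketch (not part of the commit; the paths and output notebook name are assumptions) of executing the beginners kit notebook non-interactively inside the container:

import papermill as pm

# Run the notebook headlessly and write an executed copy next to it.
# Input/output paths are illustrative; adjust them to the actual container layout.
pm.execute_notebook(
    "/home/jovyan/openaire/notebooks/beginners_kit.ipynb",
    "/home/jovyan/openaire/notebooks/beginners_kit_output.ipynb",
)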
Makefile (new file)
@@ -0,0 +1,144 @@
.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3

#################################################################################
# GLOBALS                                                                       #
#################################################################################

PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
PROFILE = default
PROJECT_NAME = project_name
PYTHON_INTERPRETER = python3

ifeq (,$(shell which conda))
HAS_CONDA=False
else
HAS_CONDA=True
endif

#################################################################################
# COMMANDS                                                                      #
#################################################################################

## Install Python Dependencies
requirements: test_environment
	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt

## Make Dataset
data: requirements
	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw

## Delete all compiled Python files
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete

## Lint using flake8
lint:
	flake8 src

## Upload Data to S3
sync_data_to_s3:
ifeq (default,$(PROFILE))
	aws s3 sync data/ s3://$(BUCKET)/data/
else
	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
endif

## Download Data from S3
sync_data_from_s3:
ifeq (default,$(PROFILE))
	aws s3 sync s3://$(BUCKET)/data/ data/
else
	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
endif

## Set up python interpreter environment
create_environment:
ifeq (True,$(HAS_CONDA))
	@echo ">>> Detected conda, creating conda environment."
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
	conda create --name $(PROJECT_NAME) python=3
else
	conda create --name $(PROJECT_NAME) python=2.7
endif
	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
else
	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
	export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
endif

## Test python environment is setup correctly
test_environment:
	$(PYTHON_INTERPRETER) test_environment.py

#################################################################################
# PROJECT RULES                                                                 #
#################################################################################

#################################################################################
# Self Documenting Commands                                                     #
#################################################################################

.DEFAULT_GOAL := help

# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# 	* save line in hold space
# 	* purge line
# 	* Loop:
# 		* append newline + line to hold space
# 		* go to next line
# 		* if line starts with doc comment, strip comment character off and loop
# 	* remove target prerequisites
# 	* append hold space (+ newline) to line
# 	* replace newline plus comments by `---`
# 	* print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
.PHONY: help
help:
	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
	@echo
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| LC_ALL='C' sort --ignore-case \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
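As a rough illustration of the self-documenting pattern the sed/awk pipeline above implements, here is a simplified Python sketch (not part of the commit): it only handles a single `## ` doc comment per target and skips the sorting and line-wrapping that the real help target does.

import sys

def print_make_help(makefile_path="Makefile"):
    # A "## description" comment documents the target defined on the next line.
    lines = open(makefile_path, encoding="utf-8").read().splitlines()
    for i, line in enumerate(lines):
        if line.startswith("## ") and i + 1 < len(lines):
            target = lines[i + 1].split(":")[0].strip()
            print(f"{target:<19} {line[3:]}")

if __name__ == "__main__":
    print_make_help(sys.argv[1] if len(sys.argv) > 1 else "Makefile")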
notebooks/beginners_kit.ipynb
@@ -29,31 +29,11 @@
 "source": [
 "This step can take some time depending on your network speed.\n",
 "\n",
-"Uncomment and run only if you need to download the data the first time: these lines just download the datasets from the deposition on Zenodo containing data for this kit (https://zenodo.org/record/7490192), untar the content and clean up. All the data needed will sit under the `data` folder."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"slideshow": {
-"slide_type": "notes"
-},
-"tags": []
-},
-"outputs": [],
-"source": [
-"# import os\n",
-"# base_url = \"https://zenodo.org/record/7490192/files/\"\n",
+"This project follows the guidelines detailed in https://drivendata.github.io/cookiecutter-data-science/ and uses `make` to perform actions.\n",
 "\n",
+"If you need to download the data the first time, open a terminal and run `make data` from the `openaire` folder; `make` will take care of all the steps (i.e., downloading the relevant datasets from Zenodo, https://zenodo.org/record/7490192, untar the content and clean up). \n",
 "\n",
-"# items =[\"communities_infrastructures.tar\",\"dataset.tar\",\"datasource.tar\",\"organization.tar\",\"otherresearchproduct.tar\",\"project.tar\",\"publication.tar\",\"relation.tar\", \"software.tar\"]\n",
-"\n",
-"# for item in items: \n",
-"# print(f\"Downloading {item}\")\n",
-"# os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
-"# print(f\"Extracting {item}\")\n",
-"# os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
+"At the end, the data we need will sit under the `data/raw` folder. Data in here should be immutable; let's keep it that way."
 ]
 },
 {
@@ -108,7 +88,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"df = pd.read_json('./data/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
+"df = pd.read_json('../data/raw/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
 "df.head(2)"
 ]
 },
@@ -128,7 +108,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# files = sorted(glob.glob('./data/publication/part-*.txt.gz'))\n",
+"# files = sorted(glob.glob('../data/raw/publication/part-*.txt.gz'))\n",
 "# publications_df = pd.concat(pd.read_json(f, compression='gzip', lines=True) for f in files)"
 ]
 },
@@ -194,7 +174,7 @@
 },
 "outputs": [],
 "source": [
-"inputPath = 'data/'\n",
+"inputPath = '../data/raw/'\n",
 " \n",
 "publications = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')\n",
 "datasets = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')\n",
@@ -222,7 +202,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"\n",
 "publications.createOrReplaceTempView(\"publications\")\n",
 "datasets.createOrReplaceTempView(\"datasets\")\n",
 "softwares.createOrReplaceTempView(\"software\")\n",
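A minimal follow-on sketch (not part of the commit) of querying the temp views registered above with Spark SQL; it assumes it runs in the same notebook session, where the views have already been created.

from pyspark.sql import SparkSession

# getOrCreate() returns the notebook's existing session, so the temp views
# (publications, datasets, software) registered above remain visible.
spark = SparkSession.builder.getOrCreate()
spark.sql("SELECT COUNT(*) AS n_publications FROM publications").show()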
requirements.txt (new file)
@@ -0,0 +1,12 @@
# local package
-e .

# external requirements
click
coverage
flake8
python-dotenv>=0.5.1
pandas
igraph
matplotlib
plotly
setup.py (new file)
@@ -0,0 +1,10 @@
from setuptools import find_packages, setup

setup(
    name='src',
    packages=find_packages(),
    version='0.1.0',
    description="OpenAIRE Beginner's Kit.",
    author='Andrea Mannocci, Miriam Baglioni, Sandro La Bruzzo',
    license='MIT',
)
src/data/make_dataset.py (new file)
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
import os
from urllib.request import urlretrieve
from urllib.parse import urlsplit
import tarfile
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

logger = logging.getLogger(__name__)

openaire_files = ["https://zenodo.org/record/7490192/files/communities_infrastructures.tar",
                  "https://zenodo.org/record/7490192/files/dataset.tar",
                  "https://zenodo.org/record/7490192/files/datasource.tar",
                  "https://zenodo.org/record/7490192/files/organization.tar",
                  "https://zenodo.org/record/7490192/files/otherresearchproduct.tar",
                  "https://zenodo.org/record/7490192/files/project.tar",
                  "https://zenodo.org/record/7490192/files/publication.tar",
                  "https://zenodo.org/record/7490192/files/relation.tar",
                  "https://zenodo.org/record/7490192/files/software.tar"]


def download_tar(url, path):
    tar_name = urlsplit(url).path.split('/')[-1]          # publication.tar
    tar_path = os.path.join(path, tar_name)               # data/raw/publication.tar
    untarred_folder = tar_name.split('.')[0]              # publication
    untarred_path = os.path.join(path, untarred_folder)   # data/raw/publication

    if not os.path.exists(untarred_path):
        if not os.path.exists(tar_path):
            logger.info('downloading %s' % url)
            urlretrieve(url, tar_path)

        logger.info('untar %s' % tar_name)
        with tarfile.open(tar_path, "r") as tar:
            tar.extractall(path)

        logger.info('cleaning')
        os.remove(tar_path)


@click.command()
@click.argument('output_filepath', type=click.Path(exists=True))
def main(output_filepath):
    """ Downloads data into /data/raw
    """
    for tar in openaire_files:
        download_tar(tar, output_filepath)


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
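For a one-off check outside the click CLI, a minimal sketch (not part of the commit) of fetching a single archive with download_tar; it assumes the src package is importable (e.g. after pip install -e . with the needed __init__.py files in place) and that the target folder is data/raw.

import os
from src.data.make_dataset import download_tar  # assumed import path

# Ensure the target folder exists, then download and extract one archive.
os.makedirs("data/raw", exist_ok=True)
download_tar("https://zenodo.org/record/7490192/files/publication.tar", "data/raw")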
test_environment.py (new file)
@@ -0,0 +1,25 @@
import sys

REQUIRED_PYTHON = "python3"


def main():
    system_major = sys.version_info.major
    if REQUIRED_PYTHON == "python":
        required_major = 2
    elif REQUIRED_PYTHON == "python3":
        required_major = 3
    else:
        raise ValueError("Unrecognized python interpreter: {}".format(
            REQUIRED_PYTHON))

    if system_major != required_major:
        raise TypeError(
            "This project requires Python {}. Found: Python {}".format(
                required_major, sys.version))
    else:
        print(">>> Development environment passes all tests!")


if __name__ == '__main__':
    main()