From f2c5e262665725353a681e4a0ae914eaaa33e000 Mon Sep 17 00:00:00 2001
From: Andrea Mannocci
Date: Fri, 23 Jun 2023 17:02:01 +0200
Subject: [PATCH] using cookiecutter template

---
 Dockerfile                    |  10 ++-
 Makefile                      | 144 ++++++++++++++++++++++++++++++++++
 notebooks/beginners_kit.ipynb |  33 ++------
 requirements.txt              |  12 +++
 setup.py                      |  10 +++
 src/data/make_dataset.py      |  62 +++++++++++++++
 test_environment.py           |  25 ++++++
 7 files changed, 268 insertions(+), 28 deletions(-)
 create mode 100644 Makefile
 create mode 100644 requirements.txt
 create mode 100644 setup.py
 create mode 100644 src/data/make_dataset.py
 create mode 100644 test_environment.py

diff --git a/Dockerfile b/Dockerfile
index e62c411..06142e0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,15 @@
 FROM jupyter/pyspark-notebook:latest
 RUN pip install papermill
 USER jovyan
+
 RUN mkdir /home/jovyan/openaire
 RUN mkdir /home/jovyan/openaire/data
+RUN mkdir /home/jovyan/openaire/data/raw
+
+ADD notebooks /home/jovyan/openaire/notebooks
+ADD src /home/jovyan/openaire/src
+ADD test_environment.py /home/jovyan/openaire/
+ADD setup.py /home/jovyan/openaire/
+ADD Makefile /home/jovyan/openaire
+ADD requirements.txt /home/jovyan/openaire/
 
-ADD notebooks/beginners_kit.ipynb /home/jovyan/openaire/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..6cabae1
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,144 @@
+.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3
+
+#################################################################################
+# GLOBALS                                                                       #
+#################################################################################
+
+PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
+PROFILE = default
+PROJECT_NAME = project_name
+PYTHON_INTERPRETER = python3
+
+ifeq (,$(shell which conda))
+HAS_CONDA=False
+else
+HAS_CONDA=True
+endif
+
+#################################################################################
+# COMMANDS                                                                      #
+#################################################################################
+
+## Install Python Dependencies
+requirements: test_environment
+	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
+	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
+
+## Make Dataset
+data: requirements
+	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw
+
+## Delete all compiled Python files
+clean:
+	find . -type f -name "*.py[co]" -delete
+	find . -type d -name "__pycache__" -delete
+
+## Lint using flake8
+lint:
+	flake8 src
+
+## Upload Data to S3
+sync_data_to_s3:
+ifeq (default,$(PROFILE))
+	aws s3 sync data/ s3://$(BUCKET)/data/
+else
+	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
+endif
+
+## Download Data from S3
+sync_data_from_s3:
+ifeq (default,$(PROFILE))
+	aws s3 sync s3://$(BUCKET)/data/ data/
+else
+	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
+endif
+
+## Set up python interpreter environment
+create_environment:
+ifeq (True,$(HAS_CONDA))
+	@echo ">>> Detected conda, creating conda environment."
+ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
+	conda create --name $(PROJECT_NAME) python=3
+else
+	conda create --name $(PROJECT_NAME) python=2.7
+endif
+	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
+else
+	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
+	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
+export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
+	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
+	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
+endif
+
+## Test python environment is setup correctly
+test_environment:
+	$(PYTHON_INTERPRETER) test_environment.py
+
+#################################################################################
+# PROJECT RULES                                                                 #
+#################################################################################
+
+
+
+#################################################################################
+# Self Documenting Commands                                                     #
+#################################################################################
+
+.DEFAULT_GOAL := help
+
+# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
+# sed script explained:
+# /^##/:
+# 	* save line in hold space
+# 	* purge line
+# 	* Loop:
+# 		* append newline + line to hold space
+# 		* go to next line
+# 		* if line starts with doc comment, strip comment character off and loop
+# 	* remove target prerequisites
+# 	* append hold space (+ newline) to line
+# 	* replace newline plus comments by `---`
+# 	* print line
+# Separate expressions are necessary because labels cannot be delimited by
+# semicolon; see <http://stackoverflow.com/a/11799865/1968>
+.PHONY: help
+help:
+	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
+	@echo
+	@sed -n -e "/^## / { \
+		h; \
+		s/.*//; \
+		:doc" \
+		-e "H; \
+		n; \
+		s/^## //; \
+		t doc" \
+		-e "s/:.*//; \
+		G; \
+		s/\\n## /---/; \
+		s/\\n/ /g; \
+		p; \
+	}" ${MAKEFILE_LIST} \
+	| LC_ALL='C' sort --ignore-case \
+	| awk -F '---' \
+		-v ncol=$$(tput cols) \
+		-v indent=19 \
+		-v col_on="$$(tput setaf 6)" \
+		-v col_off="$$(tput sgr0)" \
+	'{ \
+		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
+		n = split($$2, words, " "); \
+		line_length = ncol - indent; \
+		for (i = 1; i <= n; i++) { \
+			line_length -= length(words[i]) + 1; \
+			if (line_length <= 0) { \
+				line_length = ncol - indent - length(words[i]) - 1; \
+				printf "\n%*s ", -indent, " "; \
+			} \
+			printf "%s ", words[i]; \
+		} \
+		printf "\n"; \
+	}' \
+	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
diff --git a/notebooks/beginners_kit.ipynb b/notebooks/beginners_kit.ipynb
index 13a9ec6..4dadf6d 100644
--- a/notebooks/beginners_kit.ipynb
+++ b/notebooks/beginners_kit.ipynb
@@ -29,31 +29,11 @@
    "source": [
     "This step can take some time depending on your network speed.\n",
     "\n",
-    "Uncomment and run only if you need to download the data the first time: these lines just download the datasets from the deposition on Zenodo containing data for this kit (https://zenodo.org/record/7490192), untar the content and clean up. All the data needed will sit under the `data` folder."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "slideshow": {
-     "slide_type": "notes"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# import os\n",
-    "# base_url = \"https://zenodo.org/record/7490192/files/\"\n",
+    "This project follows the guidelines detailed in https://drivendata.github.io/cookiecutter-data-science/ and uses `make` to perform actions.\n",
     "\n",
+    "If you need to download the data the first time, open a terminal and run `make data` from the `openaire` folder; `make` will take care of all the steps (i.e., downloading the relevant datasets from Zenodo, https://zenodo.org/record/7490192, untar the content and clean up). \n",
     "\n",
-    "# items =[\"communities_infrastructures.tar\",\"dataset.tar\",\"datasource.tar\",\"organization.tar\",\"otherresearchproduct.tar\",\"project.tar\",\"publication.tar\",\"relation.tar\", \"software.tar\"]\n",
-    "\n",
-    "# for item in items: \n",
-    "#     print(f\"Downloading {item}\")\n",
-    "#     os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
-    "#     print(f\"Extracting {item}\")\n",
-    "#     os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
+    "At the end, the data we need will sit under the `data/raw` folder. Data in here should be immutable; let's keep it that way."
    ]
   },
   {
@@ -108,7 +88,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_json('./data/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
+    "df = pd.read_json('../data/raw/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
     "df.head(2)"
    ]
   },
@@ -128,7 +108,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# files = sorted(glob.glob('./data/publication/part-*.txt.gz'))\n",
+    "# files = sorted(glob.glob('../data/raw/publication/part-*.txt.gz'))\n",
     "# publications_df = pd.concat(pd.read_json(f, compression='gzip', lines=True) for f in files)"
    ]
   },
@@ -194,7 +174,7 @@
    },
    "outputs": [],
    "source": [
-    "inputPath = 'data/'\n",
+    "inputPath = '../data/raw/'\n",
     " \n",
     "publications = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')\n",
     "datasets = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')\n",
@@ -222,7 +202,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "publications.createOrReplaceTempView(\"publications\")\n",
     "datasets.createOrReplaceTempView(\"datasets\")\n",
     "softwares.createOrReplaceTempView(\"software\")\n",
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a792caf
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+# local package
+-e .
+
+# external requirements
+click
+coverage
+flake8
+python-dotenv>=0.5.1
+pandas
+igraph
+matplotlib
+plotly
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..4e1b3b8
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,10 @@
+from setuptools import find_packages, setup
+
+setup(
+    name='src',
+    packages=find_packages(),
+    version='0.1.0',
+    description="OpenAIRE Beginner's Kit.",
+    author='Andrea Mannocci, Miriam Baglioni, Sandro La Bruzzo',
+    license='MIT',
+)
diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py
new file mode 100644
index 0000000..1b0efbd
--- /dev/null
+++ b/src/data/make_dataset.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+import os
+from urllib.request import urlretrieve
+from urllib.parse import urlsplit
+import tarfile
+import click
+import logging
+from pathlib import Path
+from dotenv import find_dotenv, load_dotenv
+
+logger = logging.getLogger(__name__)
+
+openaire_files = ["https://zenodo.org/record/7490192/files/communities_infrastructures.tar",
+                  "https://zenodo.org/record/7490192/files/dataset.tar",
+                  "https://zenodo.org/record/7490192/files/datasource.tar",
+                  "https://zenodo.org/record/7490192/files/organization.tar",
+                  "https://zenodo.org/record/7490192/files/otherresearchproduct.tar",
+                  "https://zenodo.org/record/7490192/files/project.tar",
+                  "https://zenodo.org/record/7490192/files/publication.tar",
+                  "https://zenodo.org/record/7490192/files/relation.tar",
+                  "https://zenodo.org/record/7490192/files/software.tar"]
+
+
+def download_tar(url, path):
+    tar_name = urlsplit(url).path.split('/')[-1]         # publication.tar
+    tar_path = os.path.join(path, tar_name)              # data/raw/publication.tar
+    untarred_folder = tar_name.split('.')[0]             # publication
+    untarred_path = os.path.join(path, untarred_folder)  # data/raw/publication
+    if not os.path.exists(untarred_path):
+        if not os.path.exists(tar_path):
+            logger.info('downloading %s' % url)
+            urlretrieve(url, tar_path)
+
+        logger.info('untar %s' % tar_name)
+        with tarfile.open(tar_path, "r") as tar:
+            tar.extractall(path)
+
+        logger.info('cleaning')
+        os.remove(tar_path)
+
+
+@click.command()
+@click.argument('output_filepath', type=click.Path(exists=True))
+def main(output_filepath):
+    """ Downloads data into /data/raw
+    """
+    for tar in openaire_files:
+        download_tar(tar, output_filepath)
+
+
+if __name__ == '__main__':
+    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    logging.basicConfig(level=logging.INFO, format=log_fmt)
+
+    # not used in this stub but often useful for finding various files
+    project_dir = Path(__file__).resolve().parents[2]
+
+    # find .env automagically by walking up directories until it's found, then
+    # load up the .env entries as environment variables
+    load_dotenv(find_dotenv())
+
+    main()
diff --git a/test_environment.py b/test_environment.py
new file mode 100644
index 0000000..d0ac4a7
--- /dev/null
+++ b/test_environment.py
@@ -0,0 +1,25 @@
+import sys
+
+REQUIRED_PYTHON = "python3"
+
+
+def main():
+    system_major = sys.version_info.major
+    if REQUIRED_PYTHON == "python":
+        required_major = 2
+    elif REQUIRED_PYTHON == "python3":
+        required_major = 3
+    else:
+        raise ValueError("Unrecognized python interpreter: {}".format(
+            REQUIRED_PYTHON))
+
+    if system_major != required_major:
+        raise TypeError(
+            "This project requires Python {}. Found: Python {}".format(
+                required_major, sys.version))
+    else:
+        print(">>> Development environment passes all tests!")
+
+
+if __name__ == '__main__':
+    main()