using cookiecutter template
parent d70036c5fe
commit f2c5e26266

Dockerfile | 10
Dockerfile
@@ -2,7 +2,15 @@ FROM jupyter/pyspark-notebook:latest
RUN pip install papermill
USER jovyan

RUN mkdir /home/jovyan/openaire
RUN mkdir /home/jovyan/openaire/data
RUN mkdir /home/jovyan/openaire/data/raw

ADD notebooks /home/jovyan/openaire/notebooks
ADD src /home/jovyan/openaire/src
ADD test_environment.py /home/jovyan/openaire/
ADD setup.py /home/jovyan/openaire/
ADD Makefile /home/jovyan/openaire
ADD requirements.txt /home/jovyan/openaire/

ADD notebooks/beginners_kit.ipynb /home/jovyan/openaire/
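Note: with this Dockerfile, something like `docker build -t openaire-kit .` followed by `docker run -p 8888:8888 openaire-kit` should bring the kit up locally; the `openaire-kit` tag is illustrative and not part of this commit, while port 8888 is the Jupyter port exposed by the jupyter/pyspark-notebook base image.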
Makefile
@@ -0,0 +1,144 @@
.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3

#################################################################################
# GLOBALS                                                                       #
#################################################################################

PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
PROFILE = default
PROJECT_NAME = project_name
PYTHON_INTERPRETER = python3

ifeq (,$(shell which conda))
HAS_CONDA=False
else
HAS_CONDA=True
endif

#################################################################################
# COMMANDS                                                                      #
#################################################################################

## Install Python Dependencies
requirements: test_environment
	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt

## Make Dataset
data: requirements
	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw

## Delete all compiled Python files
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete

## Lint using flake8
lint:
	flake8 src

## Upload Data to S3
sync_data_to_s3:
ifeq (default,$(PROFILE))
	aws s3 sync data/ s3://$(BUCKET)/data/
else
	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
endif

## Download Data from S3
sync_data_from_s3:
ifeq (default,$(PROFILE))
	aws s3 sync s3://$(BUCKET)/data/ data/
else
	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
endif

## Set up python interpreter environment
create_environment:
ifeq (True,$(HAS_CONDA))
	@echo ">>> Detected conda, creating conda environment."
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
	conda create --name $(PROJECT_NAME) python=3
else
	conda create --name $(PROJECT_NAME) python=2.7
endif
	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
else
	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
	export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
endif

## Test python environment is setup correctly
test_environment:
	$(PYTHON_INTERPRETER) test_environment.py

#################################################################################
# PROJECT RULES                                                                 #
#################################################################################


#################################################################################
# Self Documenting Commands                                                     #
#################################################################################

.DEFAULT_GOAL := help

# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# 	* save line in hold space
# 	* purge line
# 	* Loop:
# 		* append newline + line to hold space
# 		* go to next line
# 		* if line starts with doc comment, strip comment character off and loop
# 	* remove target prerequisites
# 	* append hold space (+ newline) to line
# 	* replace newline plus comments by `---`
# 	* print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
.PHONY: help
help:
	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
	@echo
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| LC_ALL='C' sort --ignore-case \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
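The `help` recipe above is dense sed/awk. As a rough sketch of the same idea in Python — pair each `## ...` doc comment with the target on the following line, then print an aligned, word-wrapped table — not part of the commit, assuming a Makefile in the working directory, and handling only single-line doc comments (the sed pass also loops over multi-line ones):

import shutil
import textwrap

def print_help(makefile_path="Makefile", indent=19):
    # Pair each single-line "## ..." doc comment with the target defined
    # on the following line (recipe lines start with a tab, so skip them).
    rules = []
    lines = open(makefile_path).read().splitlines()
    for prev, line in zip(lines, lines[1:]):
        if prev.startswith("## ") and ":" in line and not line.startswith("\t"):
            rules.append((line.split(":")[0], prev[3:]))  # drop prerequisites
    # Print an aligned, word-wrapped table, like the awk pass above.
    ncol = shutil.get_terminal_size().columns
    for target, doc in sorted(rules, key=lambda r: r[0].lower()):
        wrapped = textwrap.wrap(doc, max(ncol - indent, 20))
        print(f"{target:<{indent}}" + f"\n{'':{indent}}".join(wrapped))

print_help()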
notebooks/beginners_kit.ipynb
@@ -29,31 +29,11 @@
    "source": [
-    "This step can take some time depending on your network speed.\n",
-    "\n",
-    "Uncomment and run only if you need to download the data the first time: these lines just download the datasets from the deposition on Zenodo containing data for this kit (https://zenodo.org/record/7490192), untar the content and clean up. All the data needed will sit under the `data` folder."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "slideshow": {
-     "slide_type": "notes"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# import os\n",
-    "# base_url = \"https://zenodo.org/record/7490192/files/\"\n",
-    "# items =[\"communities_infrastructures.tar\",\"dataset.tar\",\"datasource.tar\",\"organization.tar\",\"otherresearchproduct.tar\",\"project.tar\",\"publication.tar\",\"relation.tar\", \"software.tar\"]\n",
-    "\n",
-    "# for item in items: \n",
-    "#     print(f\"Downloading {item}\")\n",
-    "#     os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
-    "#     print(f\"Extracting {item}\")\n",
-    "#     os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
+    "This project follows the guidelines detailed in https://drivendata.github.io/cookiecutter-data-science/ and uses `make` to perform actions.\n",
+    "\n",
+    "If you need to download the data the first time, open a terminal and run `make data` from the `openaire` folder; `make` will take care of all the steps (i.e., downloading the relevant datasets from Zenodo, https://zenodo.org/record/7490192, untarring the content, and cleaning up).\n",
+    "\n",
+    "At the end, the data we need will sit under the `data/raw` folder. Data in here should be immutable; let's keep it that way."
   ]
  },
  {
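The deleted cell's wget/tar calls survive in spirit: `make data` drives `src/data/make_dataset.py`, added later in this commit. A minimal sketch of that flow for a single archive, assuming it runs from the project root and that `data/raw` already exists (the Dockerfile creates it):

# What `make data` does for one archive, in miniature
# (see src/data/make_dataset.py below for the real, guarded version).
import os
import tarfile
from urllib.request import urlretrieve

url = "https://zenodo.org/record/7490192/files/publication.tar"
tar_path = "data/raw/publication.tar"
urlretrieve(url, tar_path)                  # download from Zenodo
with tarfile.open(tar_path, "r") as tar:    # untar next to the archive
    tar.extractall("data/raw")
os.remove(tar_path)                         # clean up the tarball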
@@ -108,7 +88,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_json('./data/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
+    "df = pd.read_json('../data/raw/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
     "df.head(2)"
    ]
   },
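Note: the path moves from `./data/...` to `../data/raw/...` because the notebook now lives under `notebooks/` while the raw, immutable data sits in the sibling `data/raw` folder created in the Dockerfile above; the same change recurs in the next hunks.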
@@ -128,7 +108,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# files = sorted(glob.glob('./data/publication/part-*.txt.gz'))\n",
+    "# files = sorted(glob.glob('../data/raw/publication/part-*.txt.gz'))\n",
     "# publications_df = pd.concat(pd.read_json(f, compression='gzip', lines=True) for f in files)"
    ]
   },
@@ -194,7 +174,7 @@
   },
   "outputs": [],
   "source": [
-    "inputPath = 'data/'\n",
+    "inputPath = '../data/raw/'\n",
     " \n",
     "publications = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')\n",
     "datasets = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')\n",
@@ -222,7 +202,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "publications.createOrReplaceTempView(\"publications\")\n",
     "datasets.createOrReplaceTempView(\"datasets\")\n",
     "softwares.createOrReplaceTempView(\"software\")\n",
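Once these temp views are registered, the notebook can query the dump in plain SQL through the existing `spark` session; a minimal sketch (the aggregate is illustrative, not taken from this diff):

# Illustrative only: count rows in one of the views registered above.
spark.sql("SELECT COUNT(*) AS n_publications FROM publications").show()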
requirements.txt
@@ -0,0 +1,12 @@
# local package
-e .

# external requirements
click
coverage
flake8
python-dotenv>=0.5.1
pandas
igraph
matplotlib
plotly
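The `-e .` line installs the project itself (the `src` package declared in setup.py below) in editable mode, so its modules can be imported from the notebooks without reinstalling after every change.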
setup.py
@@ -0,0 +1,10 @@
from setuptools import find_packages, setup

setup(
    name='src',
    packages=find_packages(),
    version='0.1.0',
    description="OpenAIRE Beginner's Kit.",
    author='Andrea Mannocci, Miriam Baglioni, Sandro La Bruzzo',
    license='MIT',
)
src/data/make_dataset.py
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
import os
from urllib.request import urlretrieve
from urllib.parse import urlsplit
import tarfile
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

logger = logging.getLogger(__name__)

openaire_files = ["https://zenodo.org/record/7490192/files/communities_infrastructures.tar",
                  "https://zenodo.org/record/7490192/files/dataset.tar",
                  "https://zenodo.org/record/7490192/files/datasource.tar",
                  "https://zenodo.org/record/7490192/files/organization.tar",
                  "https://zenodo.org/record/7490192/files/otherresearchproduct.tar",
                  "https://zenodo.org/record/7490192/files/project.tar",
                  "https://zenodo.org/record/7490192/files/publication.tar",
                  "https://zenodo.org/record/7490192/files/relation.tar",
                  "https://zenodo.org/record/7490192/files/software.tar"]


def download_tar(url, path):
    tar_name = urlsplit(url).path.split('/')[-1]          # publication.tar
    tar_path = os.path.join(path, tar_name)               # data/raw/publication.tar
    untarred_folder = tar_name.split('.')[0]              # publication
    untarred_path = os.path.join(path, untarred_folder)   # data/raw/publication
    if not os.path.exists(untarred_path):
        if not os.path.exists(tar_path):
            logger.info('downloading %s' % url)
            urlretrieve(url, tar_path)

        logger.info('untar %s' % tar_name)
        with tarfile.open(tar_path, "r") as tar:
            tar.extractall(path)

        logger.info('cleaning')
        os.remove(tar_path)


@click.command()
@click.argument('output_filepath', type=click.Path(exists=True))
def main(output_filepath):
    """ Downloads data into /data/raw
    """
    for tar in openaire_files:
        download_tar(tar, output_filepath)


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
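The Makefile's `data` target invokes this script as `$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw`; thanks to the `os.path.exists` guards in `download_tar`, re-running it is cheap and skips archives that were already downloaded and extracted.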
test_environment.py
@@ -0,0 +1,25 @@
import sys

REQUIRED_PYTHON = "python3"


def main():
    system_major = sys.version_info.major
    if REQUIRED_PYTHON == "python":
        required_major = 2
    elif REQUIRED_PYTHON == "python3":
        required_major = 3
    else:
        raise ValueError("Unrecognized python interpreter: {}".format(
            REQUIRED_PYTHON))

    if system_major != required_major:
        raise TypeError(
            "This project requires Python {}. Found: Python {}".format(
                required_major, sys.version))
    else:
        print(">>> Development environment passes all tests!")


if __name__ == '__main__':
    main()
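This check is wired into the Makefile: `requirements` depends on `test_environment`, so `make requirements` (and therefore `make data`) refuses to proceed on a mismatched interpreter and prints the success line above otherwise.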