using cookiecutter template
parent d70036c5fe
commit f2c5e26266
Dockerfile
@@ -2,7 +2,15 @@ FROM jupyter/pyspark-notebook:latest
 
 RUN pip install papermill
 
 USER jovyan
 RUN mkdir /home/jovyan/openaire
 RUN mkdir /home/jovyan/openaire/data
-ADD notebooks/beginners_kit.ipynb /home/jovyan/openaire/
+RUN mkdir /home/jovyan/openaire/data/raw
+
+ADD notebooks /home/jovyan/openaire/notebooks
+ADD src /home/jovyan/openaire/src
+ADD test_environment.py /home/jovyan/openaire/
+ADD setup.py /home/jovyan/openaire/
+ADD Makefile /home/jovyan/openaire
+ADD requirements.txt /home/jovyan/openaire/
+
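Since the image installs papermill, a minimal sketch (not part of the commit; the paths and output notebook name are assumptions) of executing the beginners kit notebook non-interactively inside the container:

import papermill as pm

# Run the notebook headlessly and write an executed copy next to it.
# Input/output paths are illustrative; adjust them to the actual container layout.
pm.execute_notebook(
    "/home/jovyan/openaire/notebooks/beginners_kit.ipynb",
    "/home/jovyan/openaire/notebooks/beginners_kit_output.ipynb",
)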
Makefile (new file)
@@ -0,0 +1,144 @@
.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3

#################################################################################
# GLOBALS                                                                       #
#################################################################################

PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
PROFILE = default
PROJECT_NAME = project_name
PYTHON_INTERPRETER = python3

ifeq (,$(shell which conda))
HAS_CONDA=False
else
HAS_CONDA=True
endif

#################################################################################
# COMMANDS                                                                      #
#################################################################################

## Install Python Dependencies
requirements: test_environment
	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt

## Make Dataset
data: requirements
	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw

## Delete all compiled Python files
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete

## Lint using flake8
lint:
	flake8 src

## Upload Data to S3
sync_data_to_s3:
ifeq (default,$(PROFILE))
	aws s3 sync data/ s3://$(BUCKET)/data/
else
	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
endif

## Download Data from S3
sync_data_from_s3:
ifeq (default,$(PROFILE))
	aws s3 sync s3://$(BUCKET)/data/ data/
else
	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
endif

## Set up python interpreter environment
create_environment:
ifeq (True,$(HAS_CONDA))
	@echo ">>> Detected conda, creating conda environment."
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
	conda create --name $(PROJECT_NAME) python=3
else
	conda create --name $(PROJECT_NAME) python=2.7
endif
	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
else
	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
	export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
endif

## Test python environment is setup correctly
test_environment:
	$(PYTHON_INTERPRETER) test_environment.py

#################################################################################
# PROJECT RULES                                                                 #
#################################################################################

#################################################################################
# Self Documenting Commands                                                     #
#################################################################################

.DEFAULT_GOAL := help

# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# 	* save line in hold space
# 	* purge line
# 	* Loop:
# 		* append newline + line to hold space
# 		* go to next line
# 		* if line starts with doc comment, strip comment character off and loop
# 	* remove target prerequisites
# 	* append hold space (+ newline) to line
# 	* replace newline plus comments by `---`
# 	* print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
.PHONY: help
help:
	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
	@echo
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| LC_ALL='C' sort --ignore-case \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
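As a rough illustration of the self-documenting pattern the sed/awk pipeline above implements, here is a simplified Python sketch (not part of the commit): it only handles a single `## ` doc comment per target and skips the sorting and line-wrapping that the real help target does.

import sys

def print_make_help(makefile_path="Makefile"):
    # A "## description" comment documents the target defined on the next line.
    lines = open(makefile_path, encoding="utf-8").read().splitlines()
    for i, line in enumerate(lines):
        if line.startswith("## ") and i + 1 < len(lines):
            target = lines[i + 1].split(":")[0].strip()
            print(f"{target:<19} {line[3:]}")

if __name__ == "__main__":
    print_make_help(sys.argv[1] if len(sys.argv) > 1 else "Makefile")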
notebooks/beginners_kit.ipynb
@@ -29,31 +29,11 @@
 "source": [
 "This step can take some time depending on your network speed.\n",
 "\n",
-"Uncomment and run only if you need to download the data the first time: these lines just download the datasets from the deposition on Zenodo containing data for this kit (https://zenodo.org/record/7490192), untar the content and clean up. All the data needed will sit under the `data` folder."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"slideshow": {
-"slide_type": "notes"
-},
-"tags": []
-},
-"outputs": [],
-"source": [
-"# import os\n",
-"# base_url = \"https://zenodo.org/record/7490192/files/\"\n",
+"This project follows the guidelines detailed in https://drivendata.github.io/cookiecutter-data-science/ and uses `make` to perform actions.\n",
 "\n",
+"If you need to download the data the first time, open a terminal and run `make data` from the `openaire` folder; `make` will take care of all the steps (i.e., downloading the relevant datasets from Zenodo, https://zenodo.org/record/7490192, untar the content and clean up). \n",
 "\n",
-"# items =[\"communities_infrastructures.tar\",\"dataset.tar\",\"datasource.tar\",\"organization.tar\",\"otherresearchproduct.tar\",\"project.tar\",\"publication.tar\",\"relation.tar\", \"software.tar\"]\n",
-"\n",
-"# for item in items: \n",
-"# print(f\"Downloading {item}\")\n",
-"# os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
-"# print(f\"Extracting {item}\")\n",
-"# os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
+"At the end, the data we need will sit under the `data/raw` folder. Data in here should be immutable; let's keep it that way."
 ]
 },
 {
@@ -108,7 +88,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"df = pd.read_json('./data/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
+"df = pd.read_json('../data/raw/publication/part-00000.txt.gz', compression='gzip', lines=True)\n",
 "df.head(2)"
 ]
 },
@@ -128,7 +108,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# files = sorted(glob.glob('./data/publication/part-*.txt.gz'))\n",
+"# files = sorted(glob.glob('../data/raw/publication/part-*.txt.gz'))\n",
 "# publications_df = pd.concat(pd.read_json(f, compression='gzip', lines=True) for f in files)"
 ]
 },
@@ -194,7 +174,7 @@
 },
 "outputs": [],
 "source": [
-"inputPath = 'data/'\n",
+"inputPath = '../data/raw/'\n",
 " \n",
 "publications = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')\n",
 "datasets = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')\n",
@@ -222,7 +202,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"\n",
 "publications.createOrReplaceTempView(\"publications\")\n",
 "datasets.createOrReplaceTempView(\"datasets\")\n",
 "softwares.createOrReplaceTempView(\"software\")\n",
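A minimal follow-on sketch (not part of the commit) of querying the temp views registered above with Spark SQL; it assumes it runs in the same notebook session, where the views have already been created.

from pyspark.sql import SparkSession

# getOrCreate() returns the notebook's existing session, so the temp views
# (publications, datasets, software) registered above remain visible.
spark = SparkSession.builder.getOrCreate()
spark.sql("SELECT COUNT(*) AS n_publications FROM publications").show()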
requirements.txt (new file)
@@ -0,0 +1,12 @@
# local package
-e .

# external requirements
click
coverage
flake8
python-dotenv>=0.5.1
pandas
igraph
matplotlib
plotly
setup.py (new file)
@@ -0,0 +1,10 @@
from setuptools import find_packages, setup

setup(
    name='src',
    packages=find_packages(),
    version='0.1.0',
    description="OpenAIRE Beginner's Kit.",
    author='Andrea Mannocci, Miriam Baglioni, Sandro La Bruzzo',
    license='MIT',
)
src/data/make_dataset.py (new file)
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
import os
from urllib.request import urlretrieve
from urllib.parse import urlsplit
import tarfile
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

logger = logging.getLogger(__name__)

openaire_files = ["https://zenodo.org/record/7490192/files/communities_infrastructures.tar",
                  "https://zenodo.org/record/7490192/files/dataset.tar",
                  "https://zenodo.org/record/7490192/files/datasource.tar",
                  "https://zenodo.org/record/7490192/files/organization.tar",
                  "https://zenodo.org/record/7490192/files/otherresearchproduct.tar",
                  "https://zenodo.org/record/7490192/files/project.tar",
                  "https://zenodo.org/record/7490192/files/publication.tar",
                  "https://zenodo.org/record/7490192/files/relation.tar",
                  "https://zenodo.org/record/7490192/files/software.tar"]


def download_tar(url, path):
    tar_name = urlsplit(url).path.split('/')[-1]          # publication.tar
    tar_path = os.path.join(path, tar_name)               # data/raw/publication.tar
    untarred_folder = tar_name.split('.')[0]              # publication
    untarred_path = os.path.join(path, untarred_folder)   # data/raw/publication

    if not os.path.exists(untarred_path):
        if not os.path.exists(tar_path):
            logger.info('downloading %s' % url)
            urlretrieve(url, tar_path)

        logger.info('untar %s' % tar_name)
        with tarfile.open(tar_path, "r") as tar:
            tar.extractall(path)

        logger.info('cleaning')
        os.remove(tar_path)


@click.command()
@click.argument('output_filepath', type=click.Path(exists=True))
def main(output_filepath):
    """ Downloads data into /data/raw
    """
    for tar in openaire_files:
        download_tar(tar, output_filepath)


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
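For a one-off check outside the click CLI, a minimal sketch (not part of the commit) of fetching a single archive with download_tar; it assumes the src package is importable (e.g. after pip install -e . with the needed __init__.py files in place) and that the target folder is data/raw.

import os
from src.data.make_dataset import download_tar  # assumed import path

# Ensure the target folder exists, then download and extract one archive.
os.makedirs("data/raw", exist_ok=True)
download_tar("https://zenodo.org/record/7490192/files/publication.tar", "data/raw")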
test_environment.py (new file)
@@ -0,0 +1,25 @@
import sys

REQUIRED_PYTHON = "python3"


def main():
    system_major = sys.version_info.major
    if REQUIRED_PYTHON == "python":
        required_major = 2
    elif REQUIRED_PYTHON == "python3":
        required_major = 3
    else:
        raise ValueError("Unrecognized python interpreter: {}".format(
            REQUIRED_PYTHON))

    if system_major != required_major:
        raise TypeError(
            "This project requires Python {}. Found: Python {}".format(
                required_major, sys.version))
    else:
        print(">>> Development environment passes all tests!")


if __name__ == '__main__':
    main()