commit dcb8dbc4bd4ddebeb43c2ae34357bf7e3e1d1cca Author: Andrea Mannocci Date: Fri Jul 2 17:49:38 2021 +0200 first import diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d7c9832 --- /dev/null +++ b/.gitignore @@ -0,0 +1,89 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# DotEnv configuration +.env + +# Database +*.db +*.rdb + +# Pycharm +.idea + +# VS Code +.vscode/ + +# Spyder +.spyproject/ + +# Jupyter NB Checkpoints +.ipynb_checkpoints/ + +# exclude data from source control by default +/data/ + +# Mac OS-specific storage files +.DS_Store + +# vim +*.swp +*.swo + +# Mypy cache +.mypy_cache/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..453f5be --- /dev/null +++ b/LICENSE @@ -0,0 +1,10 @@ + +The MIT License (MIT) +Copyright (c) 2021, Andrea Mannocci + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8d5193e --- /dev/null +++ b/Makefile @@ -0,0 +1,144 @@ +.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3 + +################################################################################# +# GLOBALS # +################################################################################# + +PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) +BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://') +PROFILE = default +PROJECT_NAME = data-registries +PYTHON_INTERPRETER = python3 + +ifeq (,$(shell which conda)) +HAS_CONDA=False +else +HAS_CONDA=True +endif + +################################################################################# +# COMMANDS # +################################################################################# + +## Install Python Dependencies +requirements: test_environment + $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel + $(PYTHON_INTERPRETER) -m pip install -r requirements.txt + +## Make Dataset +data: requirements + $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed + +## Delete all compiled Python files +clean: + find . -type f -name "*.py[co]" -delete + find . -type d -name "__pycache__" -delete + +## Lint using flake8 +lint: + flake8 src + +## Upload Data to S3 +sync_data_to_s3: +ifeq (default,$(PROFILE)) + aws s3 sync data/ s3://$(BUCKET)/data/ +else + aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE) +endif + +## Download Data from S3 +sync_data_from_s3: +ifeq (default,$(PROFILE)) + aws s3 sync s3://$(BUCKET)/data/ data/ +else + aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE) +endif + +## Set up python interpreter environment +create_environment: +ifeq (True,$(HAS_CONDA)) + @echo ">>> Detected conda, creating conda environment." +ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) + conda create --name $(PROJECT_NAME) python=3 +else + conda create --name $(PROJECT_NAME) python=2.7 +endif + @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" +else + $(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper + @echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\ + export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" + @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" + @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)" +endif + +## Test python environment is setup correctly +test_environment: + $(PYTHON_INTERPRETER) test_environment.py + +################################################################################# +# PROJECT RULES # +################################################################################# + + + +################################################################################# +# Self Documenting Commands # +################################################################################# + +.DEFAULT_GOAL := help + +# Inspired by +# sed script explained: +# /^##/: +# * save line in hold space +# * purge line +# * Loop: +# * append newline + line to hold space +# * go to next line +# * if line starts with doc comment, strip comment character off and loop +# * remove target prerequisites +# * append hold space (+ newline) to line +# * replace newline plus comments by `---` +# * print line +# Separate expressions are necessary because labels cannot be delimited by +# semicolon; see +.PHONY: help +help: + @echo "$$(tput bold)Available rules:$$(tput sgr0)" + @echo + @sed -n -e "/^## / { \ + h; \ + s/.*//; \ + :doc" \ + -e "H; \ + n; \ + s/^## //; \ + t doc" \ + -e "s/:.*//; \ + G; \ + s/\\n## /---/; \ + s/\\n/ /g; \ + p; \ + }" ${MAKEFILE_LIST} \ + | LC_ALL='C' sort --ignore-case \ + | awk -F '---' \ + -v ncol=$$(tput cols) \ + -v indent=19 \ + -v col_on="$$(tput setaf 6)" \ + -v col_off="$$(tput sgr0)" \ + '{ \ + printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ + n = split($$2, words, " "); \ + line_length = ncol - indent; \ + for (i = 1; i <= n; i++) { \ + line_length -= length(words[i]) + 1; \ + if (line_length <= 0) { \ + line_length = ncol - indent - length(words[i]) - 1; \ + printf "\n%*s ", -indent, " "; \ + } \ + printf "%s ", words[i]; \ + } \ + printf "\n"; \ + }' \ + | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') diff --git a/README.md b/README.md new file mode 100644 index 0000000..522d115 --- /dev/null +++ b/README.md @@ -0,0 +1,57 @@ +data-registries +============================== + +A short description of the project. + +Project Organization +------------ + + ├── LICENSE + ├── Makefile <- Makefile with commands like `make data` or `make train` + ├── README.md <- The top-level README for developers using this project. + ├── data + │   ├── external <- Data from third party sources. + │   ├── interim <- Intermediate data that has been transformed. + │   ├── processed <- The final, canonical data sets for modeling. + │   └── raw <- The original, immutable data dump. + │ + ├── docs <- A default Sphinx project; see sphinx-doc.org for details + │ + ├── models <- Trained and serialized models, model predictions, or model summaries + │ + ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), + │ the creator's initials, and a short `-` delimited description, e.g. + │ `1.0-jqp-initial-data-exploration`. + │ + ├── references <- Data dictionaries, manuals, and all other explanatory materials. + │ + ├── reports <- Generated analysis as HTML, PDF, LaTeX, etc. + │   └── figures <- Generated graphics and figures to be used in reporting + │ + ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g. + │ generated with `pip freeze > requirements.txt` + │ + ├── setup.py <- makes project pip installable (pip install -e .) so src can be imported + ├── src <- Source code for use in this project. + │   ├── __init__.py <- Makes src a Python module + │ │ + │   ├── data <- Scripts to download or generate data + │   │   └── make_dataset.py + │ │ + │   ├── features <- Scripts to turn raw data into features for modeling + │   │   └── build_features.py + │ │ + │   ├── models <- Scripts to train models and then use trained models to make + │ │ │ predictions + │   │   ├── predict_model.py + │   │   └── train_model.py + │ │ + │   └── visualization <- Scripts to create exploratory and results oriented visualizations + │   └── visualize.py + │ + └── tox.ini <- tox file with settings for running tox; see tox.readthedocs.io + + +-------- + +

Project based on the cookiecutter data science project template. #cookiecutterdatascience

diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..3cab74d --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,153 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/data-registries.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/data-registries.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/data-registries" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/data-registries" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/docs/commands.rst b/docs/commands.rst new file mode 100644 index 0000000..2d162f3 --- /dev/null +++ b/docs/commands.rst @@ -0,0 +1,10 @@ +Commands +======== + +The Makefile contains the central entry points for common tasks related to this project. + +Syncing data to S3 +^^^^^^^^^^^^^^^^^^ + +* `make sync_data_to_s3` will use `aws s3 sync` to recursively sync files in `data/` up to `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/`. +* `make sync_data_from_s3` will use `aws s3 sync` to recursively sync files from `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/` to `data/`. diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..317865d --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,244 @@ +# -*- coding: utf-8 -*- +# +# data-registries documentation build configuration file, created by +# sphinx-quickstart. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import os +import sys + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'data-registries' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'data-registriesdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', + 'data-registries.tex', + u'data-registries Documentation', + u"Andrea Mannocci", 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'data-registries', u'data-registries Documentation', + [u"Andrea Mannocci"], 1) +] + +# If true, show URL addresses after external links. +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'data-registries', u'data-registries Documentation', + u"Andrea Mannocci", 'data-registries', + 'A short description of the project.', 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# texinfo_appendices = [] + +# If false, no module index is generated. +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# texinfo_show_urls = 'footnote' diff --git a/docs/getting-started.rst b/docs/getting-started.rst new file mode 100644 index 0000000..b4f71c3 --- /dev/null +++ b/docs/getting-started.rst @@ -0,0 +1,6 @@ +Getting started +=============== + +This is where you describe how to get set up on a clean install, including the +commands necessary to get the raw data (using the `sync_data_from_s3` command, +for example), and then how to make the cleaned, final data sets. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..92a1ec3 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,24 @@ +.. data-registries documentation master file, created by + sphinx-quickstart. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +data-registries documentation! +============================================== + +Contents: + +.. toctree:: + :maxdepth: 2 + + getting-started + commands + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..7cf5761 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,190 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\data-registries.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\data-registries.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +:end diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/01-Explorative.ipynb b/notebooks/01-Explorative.ipynb new file mode 100644 index 0000000..af4a892 --- /dev/null +++ b/notebooks/01-Explorative.ipynb @@ -0,0 +1,8021 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import csv\n", + "import json\n", + "import reverse_geocoder as rg\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import plotly\n", + "from plotly.offline import iplot, init_notebook_mode\n", + "import plotly.graph_objs as go\n", + "import plotly.express as px" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FAIRsharing" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
0GenBankGenBankhttps://fairsharing.org/10.25504/FAIRsharing.9...https://www.ncbi.nlm.nih.gov/genbank/[European Union, Japan, United States][Bioinformatics, Data Management, Data Submiss...
1GlycoNAVIGlycoNAVIhttps://fairsharing.org/10.25504/FAIRsharing.w...https://glyconavi.org/[Japan][Chemistry, Glycomics, Life Science, Organic C...
2ADHDgeneADHDgenehttps://fairsharing.org/10.25504/FAIRsharing.m...http://adhd.psych.ac.cn/[China][Biomedical Science, Genetics]
3Allele frequency resource for research and tea...ALFREDhttps://fairsharing.org/10.25504/FAIRsharing.y...http://alfred.med.yale.edu[United States][Life Science]
4Animal Transcription Factor DatabaseAnimalTFDBhttps://fairsharing.org/10.25504/FAIRsharing.e...http://bioinfo.life.hust.edu.cn/AnimalTFDB/[China][Life Science]
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "0 GenBank GenBank \n", + "1 GlycoNAVI GlycoNAVI \n", + "2 ADHDgene ADHDgene \n", + "3 Allele frequency resource for research and tea... ALFRED \n", + "4 Animal Transcription Factor Database AnimalTFDB \n", + "\n", + " fs_url \\\n", + "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n", + "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n", + "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n", + "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n", + "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n", + "\n", + " url \\\n", + "0 https://www.ncbi.nlm.nih.gov/genbank/ \n", + "1 https://glyconavi.org/ \n", + "2 http://adhd.psych.ac.cn/ \n", + "3 http://alfred.med.yale.edu \n", + "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n", + "\n", + " countries \\\n", + "0 [European Union, Japan, United States] \n", + "1 [Japan] \n", + "2 [China] \n", + "3 [United States] \n", + "4 [China] \n", + "\n", + " subjects \n", + "0 [Bioinformatics, Data Management, Data Submiss... \n", + "1 [Chemistry, Glycomics, Life Science, Organic C... \n", + "2 [Biomedical Science, Genetics] \n", + "3 [Life Science] \n", + "4 [Life Science] " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n", + " delimiter='|', header=0,\n", + " names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n", + "fairsharing_df['subjects'] = fairsharing_df.subjects.str.split(pat=',')\n", + "fairsharing_df['countries'] = fairsharing_df.countries.str.split(pat=',')\n", + "fairsharing_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
count175217521752175217491690
unique1752174117521752178834
topThe Cardiovascular Research GridCGDhttps://fairsharing.org/bsg-d001750http://www.bmrb.wisc.edu/[United States][Life Science]
freq1311588367
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "count 1752 1752 \n", + "unique 1752 1741 \n", + "top The Cardiovascular Research Grid CGD \n", + "freq 1 3 \n", + "\n", + " fs_url url \\\n", + "count 1752 1752 \n", + "unique 1752 1752 \n", + "top https://fairsharing.org/bsg-d001750 http://www.bmrb.wisc.edu/ \n", + "freq 1 1 \n", + "\n", + " countries subjects \n", + "count 1749 1690 \n", + "unique 178 834 \n", + "top [United States] [Life Science] \n", + "freq 588 367 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "Life Science", + "Biomedical Science", + "Earth Science", + "Genomics", + "Environmental Science", + "Oceanography", + "Biodiversity", + "Atmospheric Science", + "Epidemiology", + "Genetics", + "Health Science", + "Virology", + "Biology", + "Proteomics", + "Bioinformatics", + "Agriculture", + "Geology", + "Preclinical Studies", + "Transcriptomics", + "Chemistry", + "Comparative Genomics", + "Data Management", + "Clinical Studies", + "Botany", + "Functional Genomics", + "Medicine", + "Geophysics", + "Meteorology", + "Humanities and Social Sciences", + "Natural Science", + "Social Science", + "Systems Biology", + "Geography", + "Ecology", + "Data Submission", + " Annotation and Curation", + "Metabolomics", + "Engineering Science", + "Marine Biology", + "Physics", + "Economics", + "Hydrology", + "Ontology and Terminology", + "Biochemistry", + "Astrophysics and Astronomy", + "Phylogenetics", + "Molecular biology", + "Epigenetics", + "Medical Virology", + "Remote Sensing", + "Infectious Disease Medicine", + "Immunology", + "Humanities", + "Anatomy", + "Computational Biology", + "Structural Biology", + "Neurobiology", + "Plant Genetics", + "Computer Science", + "Public Health", + "Knowledge and Information Systems", + "Microbiology", + "Demographics", + "Social and Behavioural Science", + "Data Visualization", + "Oncology", + "Developmental Biology", + "Critical Care Medicine", + "Hydrogeology", + "Data Integration", + "Glycomics", + "Ecosystem Science", + "Soil Science", + "Geochemistry", + "Population Genetics", + "Drug Discovery", + "Materials Science", + "Water Research", + "Neuroscience", + "Forest Management", + "Plant Breeding", + "Metagenomics", + "Energy Engineering", + "Water Management", + "Paleontology", + "Software Engineering", + "Geodesy", + "Taxonomy", + "Cell Biology", + "Phylogenomics", + "Immunogenetics", + "Pharmacology", + "Mineralogy", + "Freshwater Science", + "Medical Informatics", + "Statistics", + "Epigenomics", + "Human Genetics", + "Phylogeny", + "Global Health", + "Animal Genetics", + "Cheminformatics", + "Evolutionary Biology", + "Zoology", + "Mathematics", + "Microbial Ecology", + "Population Dynamics", + "Political Science", + "Nanotechnology", + "Psychology", + "Physical Geography", + "Education Science", + "Drug Development", + "Culture", + "Translational Medicine", + "Pathology", + "Food Security", + "Informatics", + "Neurophysiology", + "Natural History", + "Phenomics", + "Nutritional Science", + "Computational Neuroscience", + "Biotechnology", + "Bioengineering", + "Geoinformatics", + "Data Governance", + "Cartography", + "History", + "Analytical Chemistry", + "Organic Chemistry", + "Urban Planning", + "Plant Anatomy", + "Enzymology", + "Classical Archaeology", + "Animal Husbandry", + "Maritime Engineering", + "Materials Engineering", + "Database Management", + "Cardiology", + "Anthropology", + "Architecture", + "Transportation Planning", + "Criminology", + "Primary Health Care", + "Molecular Genetics", + "Toxicology", + "Omics", + "Communication Science", + "Agronomy", + "Physiology", + "Art", + "Endocrinology", + "Fisheries Science", + "Economic and Social History", + "Drug Metabolism", + "Thermodynamics", + "Plant Ecology", + "Tropical Medicine", + "Aerospace Engineering", + "Data Quality", + "Chemical Engineering", + "Data Mining", + "Health Services Research", + "Linguistics", + "Medicinal Chemistry", + "Agricultural Engineering", + "Geriatric Medicine", + "Toxicogenomics", + "Drug Repositioning", + "Reproductive Health", + "Materials Informatics", + "Construction Engineering", + "Entomology", + "Aquaculture", + "Pediatrics", + "Agroecology", + "Civil Engineering", + "Inorganic Molecular Chemistry", + "Business Administration", + "Respiratory Medicine", + "Embryology", + "Molecular Microbiology", + "Power Engineering", + "Composite Materials", + "Molecular Infection Biology", + "Computational Chemistry", + "Synthetic Chemistry", + "Synthetic Biology", + "Building Engineering Physics", + "Farming Systems Research", + "Biomaterials", + "Pharmacy", + "Veterinary Medicine", + "Gastroenterology", + "Structural Genomics", + "Pharmacogenomics", + "Occupational Medicine", + "Community Care", + "Molecular Dynamics", + "Fine Arts", + "Ancient Cultures", + "Human Geography", + "Molecular Chemistry", + "Quantitative Genetics", + " Learning and Training", + "Human Biology", + "Rural and Agricultural Sociology", + "Social Policy", + "Social Psychology", + "Industrial Engineering", + "Jurisprudence", + "Research on Teaching", + "Limnology", + "Agricultural Economics", + "Historical Linguistics", + "Data Security", + "Prehistory", + "Geotechnics", + "Cultural Studies", + "Public Finance", + "Art History", + "Proteogenomics", + "Digital Image Processing", + "Surgery", + "Plant Cell Biology", + " Optical and Plasma Physics", + "Safety Science", + "Traditional Medicine", + " Molecular", + "Process Engineering", + "Rheumatology", + "Telecommunication Engineering", + "Plastics Engineering", + "Acoustics", + "Plant Cultivation", + "Religious Studies", + "Policy", + "Systemic Neuroscience", + "Agricultural Law", + "Technical Chemistry", + "Public Law", + "Radiology", + "Synthesis Chemistry", + "Chemical Biology", + "Physical Chemistry", + "Horticulture", + "Hematology", + "Gynecology", + "Artificial Intelligence", + "Atomic", + "Behavioural Biology", + "Biological Process Engineering", + "Functional Materials Research", + "Biological Psychology", + "Food Process Engineering", + "Biomimetic Chemistry", + "Biophysics", + "Empirical Social Research", + "Electrophysiology", + "Electrical Engineering", + "Biotherapeutics", + "Economic Theory", + "Economic Policy", + "Building Design", + "Developmental Neurobiology", + "Dermatology", + "Criminal Law", + "Component Engineering", + "Comparative Neurobiology", + "Cognitive Neuroscience", + "Clinical Veterinary Medicine", + "Clinical Psychology", + "Clinical Chemistry", + "Classical Philology", + "Cellular Neuroscience", + "History of Science", + "Human-Machine Systems Engineering", + "Photogrammetry", + "Hydraulic Engineering", + "Philosophy", + "Personalized Medicine", + "Parasitology", + "Organic Molecular Chemistry", + "Ophthalmology", + "Obstetrics", + "Neurology", + "Musculoskeletal Medicine", + "Animal Breeding", + "Molecular Physical Chemistry", + "Molecular Neuroscience", + "Microstructural Mechanical Properties of Materials", + "Microbial Physiology", + "Microbial Genetics", + "Metal-Cutting Manufacturing Engineering", + "Medicines Research and Development", + "Animal Physiology", + "Medical Physics", + "Media Studies", + "Mechanics", + "Mechanical Process Engineering", + "Mechanical Engineering", + "Materials Structuring and Functionalisation", + "Applied Linguistics", + "Logistics Engineering", + "Literary Studies", + "Applied Mathematics", + "Landscape Planning", + "Applied Microbiology", + "Mechanical Behaviour of Construction Materials" + ], + "y": [ + 900, + 252, + 227, + 166, + 134, + 95, + 80, + 78, + 75, + 73, + 67, + 66, + 65, + 61, + 60, + 58, + 51, + 48, + 48, + 46, + 46, + 45, + 45, + 44, + 42, + 42, + 41, + 40, + 39, + 38, + 36, + 35, + 33, + 32, + 31, + 31, + 30, + 29, + 27, + 26, + 26, + 25, + 25, + 25, + 25, + 20, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 18, + 17, + 17, + 17, + 17, + 17, + 16, + 16, + 16, + 16, + 16, + 15, + 15, + 15, + 15, + 14, + 14, + 14, + 14, + 14, + 14, + 13, + 13, + 13, + 13, + 13, + 13, + 13, + 12, + 12, + 12, + 12, + 11, + 11, + 11, + 11, + 11, + 11, + 10, + 10, + 10, + 9, + 9, + 9, + 8, + 8, + 8, + 8, + 8, + 8, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "Fairsharing subject coverage" + }, + "xaxis": { + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fairsharing_subjects = fairsharing_df.explode('subjects').groupby('subjects')[['url']].count().sort_values('url', ascending=False)\n", + "\n", + "data = [\n", + " go.Bar(\n", + " x=fairsharing_subjects.index,\n", + " y=fairsharing_subjects['url']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Fairsharing subject coverage',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "United States", + "United Kingdom", + "Germany", + "France", + "Switzerland", + "China", + "Netherlands", + "Italy", + "Canada", + "Belgium", + "Spain", + "Japan", + "Sweden", + "Czech Republic", + "Norway", + "Denmark", + "European Union", + "Austria", + "Finland", + "Republic of Ireland", + "Australia", + "Israel", + "Portugal", + "Hungary", + "Greece", + "Malta", + "Lithuania", + "Slovakia", + "Iceland", + "Luxembourg", + "Montenegro", + "Croatia", + "Worldwide", + "India", + "Poland", + "Singapore", + "South Korea", + "Russia", + "South Africa", + "Taiwan", + "Brazil", + "New Zealand", + "Mexico", + "Saudi Arabia", + "Bulgaria", + "Hong Kong", + "Argentina", + "Turkey", + "Cyprus", + "Morocco", + "Uganda", + "Estonia", + "Romania", + "Thailand", + "Pakistan", + "Costa Rica", + "Uruguay", + "United Arab Emirates", + "Togo", + "Antarctica", + "Panama", + "Honduras", + "Benin", + "Cameroon", + "Chile", + "Colombia", + "Egypt", + "El Salvador", + "Ethiopia", + "Faroe Islands", + "Greenland", + "Indonesia", + "Nigeria", + "Kenya", + "Latvia", + "Madagascar", + "Malawi", + "Mali", + "Mauritania", + "Mozambique", + "Nicaragua", + "Niger", + "Zimbabwe" + ], + "y": [ + 686, + 248, + 192, + 162, + 114, + 99, + 96, + 91, + 86, + 83, + 83, + 80, + 76, + 71, + 69, + 67, + 66, + 64, + 63, + 62, + 62, + 61, + 60, + 59, + 58, + 53, + 52, + 52, + 52, + 52, + 51, + 51, + 49, + 32, + 11, + 10, + 10, + 9, + 9, + 8, + 8, + 8, + 8, + 6, + 3, + 3, + 3, + 3, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "Fairsharing country coverage" + }, + "xaxis": { + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fairsharing_countries = fairsharing_df.explode('countries').groupby('countries')[['url']].count().sort_values('url', ascending=False)\n", + "\n", + "data = [\n", + " go.Bar(\n", + " x=fairsharing_countries.index,\n", + " y=fairsharing_countries['url']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Fairsharing country coverage',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# re3data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
410|re3data_____::3f2e20af26ead0432f5470d8b739638dhttp://planttfdb.cbi.pku.edu.cn/Plant Transcription Factor DatabasePlantTFDBNaN0.00.0['Life Sciences', 'Basic Biological and Medica...
710|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfchttps://spdf.gsfc.nasa.gov/Space Physics Data FacilityNASA's Space Physics Data Facility SPDFNaN0.00.0['Natural Sciences', 'Astrophysics and Astrono...
1310|re3data_____::59521daca59ac29b811343cc4cd370cfhttp://card.westgis.ac.cn/Cold and Arid Regions Science Data Center at L...CARD WDC for Glaciology and Geocryology World ...NaN0.00.0['Natural Sciences', 'Geosciences (including G...
1410|re3data_____::ec1ba1674c852466c266acb64c618d15https://www.psycharchives.org/PsycharchivesNaNNaN0.00.0['Humanities and Social Sciences', 'Psychology...
1910|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76https://www.ihfc-iugg.org/products/global-heat...The Global Heat Flow Database of the Internati...International Heat-flow DatabaseNaN0.00.0['Natural Sciences', 'Geology and Palaeontolog...
\n", + "
" + ], + "text/plain": [ + " id \\\n", + "4 10|re3data_____::3f2e20af26ead0432f5470d8b739638d \n", + "7 10|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc \n", + "13 10|re3data_____::59521daca59ac29b811343cc4cd370cf \n", + "14 10|re3data_____::ec1ba1674c852466c266acb64c618d15 \n", + "19 10|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76 \n", + "\n", + " url \\\n", + "4 http://planttfdb.cbi.pku.edu.cn/ \n", + "7 https://spdf.gsfc.nasa.gov/ \n", + "13 http://card.westgis.ac.cn/ \n", + "14 https://www.psycharchives.org/ \n", + "19 https://www.ihfc-iugg.org/products/global-heat... \n", + "\n", + " official_name \\\n", + "4 Plant Transcription Factor Database \n", + "7 Space Physics Data Facility \n", + "13 Cold and Arid Regions Science Data Center at L... \n", + "14 Psycharchives \n", + "19 The Global Heat Flow Database of the Internati... \n", + "\n", + " english_name description latitude \\\n", + "4 PlantTFDB NaN 0.0 \n", + "7 NASA's Space Physics Data Facility SPDF NaN 0.0 \n", + "13 CARD WDC for Glaciology and Geocryology World ... NaN 0.0 \n", + "14 NaN NaN 0.0 \n", + "19 International Heat-flow Database NaN 0.0 \n", + "\n", + " longitude subjects \n", + "4 0.0 ['Life Sciences', 'Basic Biological and Medica... \n", + "7 0.0 ['Natural Sciences', 'Astrophysics and Astrono... \n", + "13 0.0 ['Natural Sciences', 'Geosciences (including G... \n", + "14 0.0 ['Humanities and Social Sciences', 'Psychology... \n", + "19 0.0 ['Natural Sciences', 'Geology and Palaeontolog... " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')\n", + "re3data_df = re3data_df[re3data_df.id.str.contains('re3data')]\n", + "re3data_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df.loc[(re3data_df.latitude == 0.0) & (re3data_df.longitude == 0.0), ['latitude', 'longitude']] = [np.nan, np.nan]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 ['Life Sciences', 'Basic Biological and Medica...\n", + "7 ['Natural Sciences', 'Astrophysics and Astrono...\n", + "13 ['Natural Sciences', 'Geosciences (including G...\n", + "14 ['Humanities and Social Sciences', 'Psychology...\n", + "19 ['Natural Sciences', 'Geology and Palaeontolog...\n", + " ... \n", + "8693 ['Life Sciences', 'Basic Biological and Medica...\n", + "8695 ['Natural Sciences', 'Atmospheric Science and ...\n", + "8697 ['Natural Sciences', 'Atmospheric Science and ...\n", + "8699 ['Natural Sciences', 'Atmospheric Science and ...\n", + "8705 ['Life Sciences', 'Plant Sciences', 'Plant Gen...\n", + "Name: subjects, Length: 2693, dtype: object" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df.subjects" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df['subjects'] = re3data_df.subjects.apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def merge_lists(lists):\n", + " res = []\n", + " for l in lists:\n", + " res = res + l\n", + " return res\n", + "\n", + "re3data_cleaned_subjects = re3data_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n", + " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n", + " .reset_index()\\\n", + " .groupby('index')[0].apply(lambda x: merge_lists(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "index\n", + "4 [Life Sciences, Basic Biological, Medical Rese...\n", + "7 [Natural Sciences, Astrophysics, Astronomy, Ph...\n", + "13 [Natural Sciences, Geosciences (including Geog...\n", + "14 [Humanities, Social Sciences, Psychology, Soci...\n", + "19 [Natural Sciences, Geology, Palaeontology, Geo...\n", + " ... \n", + "8693 [Life Sciences, Basic Biological, Medical Rese...\n", + "8695 [Natural Sciences, Atmospheric Science, Oceano...\n", + "8697 [Natural Sciences, Atmospheric Science, Oceano...\n", + "8699 [Natural Sciences, Atmospheric Science, Oceano...\n", + "8705 [Life Sciences, Plant Sciences, Plant Genetics...\n", + "Name: 0, Length: 2693, dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_cleaned_subjects" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df = re3data_df.join(re3data_cleaned_subjects)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df.drop(columns=['subjects'], inplace=True)\n", + "re3data_df.rename(columns={0:'subjects'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
count2693267326932034385.0000005.0000002693
unique269326612668201038NaNNaN1427
top10|re3data_____::e59f89142e8d47d32523c53a9137f07bhttp://iubio.bio.indiana.edu/IUBio-ArchiveResearch Data RepositoryIUBio Archive is an archive of biology data an...NaNNaN[Humanities, Social Sciences, Life Sciences, N...
freq12221NaNNaN209
meanNaNNaNNaNNaNNaN61.66811336.623678NaN
stdNaNNaNNaNNaNNaN96.98445748.547521NaN
minNaNNaNNaNNaNNaN12.12300012.123000NaN
25%NaNNaNNaNNaNNaN12.12300012.123400NaN
50%NaNNaNNaNNaNNaN12.12340012.123400NaN
75%NaNNaNNaNNaNNaN37.97116323.748590NaN
maxNaNNaNNaNNaNNaN234.000000123.000000NaN
\n", + "
" + ], + "text/plain": [ + " id \\\n", + "count 2693 \n", + "unique 2693 \n", + "top 10|re3data_____::e59f89142e8d47d32523c53a9137f07b \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " url official_name \\\n", + "count 2673 2693 \n", + "unique 2661 2668 \n", + "top http://iubio.bio.indiana.edu/ IUBio-Archive \n", + "freq 2 2 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " english_name \\\n", + "count 2034 \n", + "unique 2010 \n", + "top Research Data Repository \n", + "freq 2 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " description latitude \\\n", + "count 38 5.000000 \n", + "unique 38 NaN \n", + "top IUBio Archive is an archive of biology data an... NaN \n", + "freq 1 NaN \n", + "mean NaN 61.668113 \n", + "std NaN 96.984457 \n", + "min NaN 12.123000 \n", + "25% NaN 12.123000 \n", + "50% NaN 12.123400 \n", + "75% NaN 37.971163 \n", + "max NaN 234.000000 \n", + "\n", + " longitude subjects \n", + "count 5.000000 2693 \n", + "unique NaN 1427 \n", + "top NaN [Humanities, Social Sciences, Life Sciences, N... \n", + "freq NaN 209 \n", + "mean 36.623678 NaN \n", + "std 48.547521 NaN \n", + "min 12.123000 NaN \n", + "25% 12.123400 NaN \n", + "50% 12.123400 NaN \n", + "75% 23.748590 NaN \n", + "max 123.000000 NaN " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df.describe(include='all')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "Life Sciences", + "Natural Sciences", + "Humanities", + "Social Sciences", + "Medicine", + "Biology", + "Geosciences (including Geography)", + "Oceanography", + "Atmospheric Science", + "Basic Biological", + "Medical Research", + "Engineering Sciences", + "Social", + "Behavioural Sciences", + "Geodesy", + "Geophysics", + " Horticulture", + " Forestry", + "Agriculture", + "Veterinary Medicine", + "Physics", + "Zoology", + "Immunology", + "Computer Science", + "Chemistry", + "General Genetics", + "Microbiology", + " Virology", + "Plant Sciences", + "Astronomy", + "Astrophysics", + "Economics", + "Bioinformatics", + "Theoretical Biology", + "Geography", + "Water Research", + "System Engineering", + " Electrical", + " Health Services Research", + "Public Health", + " Social Medicine", + "Human Genetics", + "Geochemistry", + "Crystallography", + " Mineralogy", + "Developmental Biology", + "Empirical Social Research", + "Architecture", + "Linguistics", + " Geoinformatics", + " Remote Sensing", + " Cartogaphy", + " Photogrammetry", + "Palaeontology", + "Geology", + " Quantum Optics", + "History", + "Optics", + " Molecules", + "Animal Genetics", + " Cell", + "Construction Engineering", + "Cell Biology", + "Neurosciences", + "Ecosystem Research", + " Biodiversity", + "Biochemistry", + "Animal Ecology", + "Media Studies", + "Physics of Atoms", + "Plasmas", + " Music", + "Fine Arts", + " Theatre", + "Plant Ecology", + "Ecosystem Analysis", + "Plant Genetics", + "Ancient Cultures", + "Materials Science", + "Cultural Anthropology", + "Particles", + " Nuclei", + "Fields", + "Economic", + "Statistics", + "Econometrics", + "Education Sciences", + "Epidemiology", + " Medical Informatics", + " Medical Biometry", + "Political Science", + "Religious Studies", + "Jurisprudence", + "Social Policy", + " Hydrology", + " Integrated Water Resources Management", + "Hydrogeology", + " Urban Water Management", + " Limnology", + " Water Chemistry", + "Structural Biology", + " Social", + " Jewish Studies", + "Cultures", + "Non-European Languages", + "Molecular Chemistry", + "Mathematics", + "Engineering", + "Psychology", + "Evolution", + " Biochemistry", + "Genetics of Microorganisms", + "Metabolism", + "Soil Sciences", + "Theoretical Chemistry", + "Physical", + "Ecology of Agricultural Landscapes", + "Analytical Chemistry", + " Method Development (Chemistry)", + "Condensed Matter Physics", + "Pharmacology", + "Systems Engineering", + "Human Geography", + "Literary Studies", + "Liquids - Spectroscopy", + " Interfaces", + "Food Chemistry", + "Biophysics", + "Basic Forest Research", + " Kinetics", + "Physical Chemistry of Molecules", + " Image", + "Language Processing", + "Surface Research", + "Artificial Intelligence", + "Chemical Solid State", + "Physical Geography", + " Atoms", + "Biological Chemistry", + " Anthropology", + " Plasmas", + "Ethnology/Folklore", + "Art History", + "Thermal Engineering/Process Engineering", + "Sociology", + "Agricultural Economics", + "Theology", + "Virology", + "Cognitive Neuroscience", + "Urbanism", + " Transportation", + "Infrastructure Planning", + "Neuroimaging", + "Musicology", + " Landscape Planning", + " Spatial Planning", + "Modern", + " Transfusion Medicine", + " Oncology", + "Medical Physics", + "Plant Systematics", + "Current History", + "Hematology", + "Biomedical Technology", + " Building", + "Medical Microbiology", + " Building Design", + " Sustainable Building Technology", + "Construction History", + " Molecular Infection Biology", + "Morphology", + "Traffic", + "Electrical Engineering", + "Systematics", + "Transport Systems", + " Logistics", + "Communication Science", + "Anatomy", + "Pharmacy", + "Use of Forest Resources", + "Classical Archaeology", + "Business Administration", + "Inventory Control", + "Public Finance", + "Occupational Medicine", + "Toxicology", + "Social History", + "Physiology", + "Plant Biochemistry", + "Training", + "Plant Breeding", + "Mechanical", + "History of Science", + "Pediatric", + "Research on Teaching", + "Process Engineering", + "industrial Engineering", + " Learning", + " Technical Chemistry", + "Adolescent Medicine", + "Heat Energy Technology", + " Legal History", + " Legal Theory", + "Plant Cultivation", + "Radiology", + " Fluid Mechanics", + " Thermal Machines", + "Legal", + "Political Philosophy", + "Nuclear Medicine", + "Systemic Neuroscience", + "Microbial Ecology", + "Animal Physiology", + "Applied Microbiology", + " Behaviour", + " Computational Neuroscience", + "Jewish Studies", + "Materials Engineering", + "Organic Molecular Chemistry", + "General", + "Research on Socialization", + "Professions", + "Public Law", + "Educational Institutions", + "Software Technology", + "Typology", + "Basic Veterinary Medical Science", + "Medieval History", + "Early Modern History", + "Surfaces", + " Material Characterisation", + "Philosophy", + "Forensic Medicine", + " Historical Linguistics", + " Non-European Languages", + "Physical Chemistry of Solids", + "Pathology", + " Soft Matter", + "Modelling", + " Metabolism", + " Biological Physics", + "Theory", + "Ancient History", + "Ancient Near Eastern Studies", + "Molecular Neuroscience", + "Agricultural", + "Neurogenetics", + " Nonlinear Dynamics", + "Prehistory", + "Statistical Physics", + "Egyptology", + "Criminology", + "Inorganic Molecular Chemistry", + "Gastroenterology", + "Food Process Engineering", + "Obstetrics", + "Constructive Mechanical Engineering", + "Acoustics", + "Human Factors", + "Gynaecology", + "Oceania Studies", + "Electrical Energy Generation", + " Distribution", + " Ergonomics", + "Sensory", + " Material Synthesis", + "Mechanics", + "Plant Cell", + "Solid State", + " Human-Machine Systems", + "Biological", + "Biomimetic Chemistry", + " American", + "Surface Chemistry", + " Application", + "African", + "Behavioural Biology", + "Therapy", + " Methodology", + " Medical Psychology", + " Semitic Studies", + "Plant Nutrition", + "Radiobiology", + "Clinical Veterinary Medicine", + "Operating", + "Polymer Research", + "General Theoretical Chemistry", + " Communication", + "Islamic Studies", + "Nutritional Sciences", + " Breeding", + "Radiation Oncology", + " Clinical Psychology", + "Endocrinology", + "Theatre", + "Differential Psychology", + "Applied Linguistics", + " Arabian Studies", + " Diagnostics", + "Hygiene", + "Animal Husbandry", + "Information Systems", + "Basic Research on Pathogenesis", + "Individual Linguistics", + " Diabetology", + "History of Education", + "European", + "Developmental", + "Asian Studies", + "Plant Physiology", + "Energy Process Engineering", + "Experimental Condensed Matter Physics", + " High-Frequency", + "General Education", + " Industrial", + "Protestant Theology", + "Network Technology", + "Communication", + " Theoretical Electrical Engineering", + "American Literature", + " Hydraulic Engineering", + "Clinical Neurosciences III - Ophthalmology", + "Geotechnics", + "Social Psychology", + "Pathobiochemistry", + "Geriatric Medicine", + "Educational Psychology", + "Organisational Psychology", + "Clinical Chemistry", + "Sociological Theory", + "Gerontology", + "Rheumatology", + " Allergology", + "Dentistry", + " Construction Operation", + "Dermatology", + " Geosciences (including Geography)", + "Sructural Engineering", + " Control Systems", + "Sintered Metallic", + " Clinical Immunology", + "Roman Catholic Theology", + "Reproductive Medicine/Biology", + " Clinical Infectiology Intensive Care Medicine", + " Building Informatics", + "Geosciences (including Geography) ", + " Atmospheric Science", + "Automation", + "Veterinary Medicine ", + "Mathematical Psychology", + "Biological Process Engineering", + " Biological", + "Preparatory", + "Thermodynamics", + " Angiology", + " Robotics", + " Oral Surgery", + "Comparative Literature", + "Atmospheric Science ", + "Biomaterials", + "Law of Criminal Procedure", + "Physical Chemistry of Polymers", + "Kinetics of Materials", + "Cardiology", + " Mechatronics", + "Criminal Law", + "Cultural Studies", + "Pneumology", + "Polymer Materials", + "Ceramic Materials", + "Technical Thermodynamics", + "Theoretical Condensed Matter Physics", + "Urology", + "Electronic Semiconductors", + " Circuits", + "Traumatology", + " Chemistry", + "Theoretical Computer Science", + "Technical Chemistry", + "Thermal Process Engineering", + " Building Physics", + "Theoretical Physics of Polymers", + "Thermal Processes", + "Inter-organismic Interactions of Plants", + "Economic Theory", + " Thermomechanical Treatment of Materials", + "Cellular Neuroscience", + "Life Sciences ", + "Biological Psychiatry", + "Measurement Systems", + "Medical Research ", + "Medieval German Literature", + "Chemical", + "Metallurgical", + "Classical Philology", + "Microstructural Mechanical Properties of Materials", + "Clinical Neurosciences I - Neurology", + "History of Philosophy", + "Orthopaedics", + " Components", + " Systems", + "Comparative Neurobiology", + "Composite Materials", + " Neurosurgery", + "Construction Material Sciences", + " Life Sciences ", + "Cardiothoracic Surgery", + "Private Law", + " General Genetics", + "Experimental", + "Social Sciences ", + "Developmental Neurobiology", + " Agriculture" + ], + "y": [ + 1440, + 1325, + 1238, + 1222, + 1014, + 882, + 760, + 581, + 535, + 514, + 513, + 496, + 451, + 432, + 360, + 326, + 317, + 317, + 316, + 315, + 308, + 239, + 234, + 227, + 224, + 220, + 212, + 212, + 210, + 205, + 205, + 204, + 175, + 175, + 159, + 143, + 142, + 142, + 131, + 131, + 131, + 117, + 108, + 108, + 108, + 104, + 103, + 101, + 101, + 99, + 99, + 99, + 99, + 98, + 98, + 96, + 96, + 96, + 96, + 94, + 94, + 86, + 85, + 81, + 79, + 79, + 79, + 79, + 78, + 75, + 75, + 74, + 74, + 74, + 67, + 67, + 65, + 65, + 64, + 63, + 63, + 63, + 63, + 62, + 62, + 62, + 61, + 57, + 57, + 57, + 56, + 52, + 50, + 50, + 48, + 48, + 48, + 48, + 48, + 48, + 45, + 44, + 44, + 44, + 44, + 42, + 41, + 40, + 38, + 37, + 36, + 36, + 36, + 34, + 33, + 33, + 31, + 27, + 27, + 26, + 25, + 25, + 24, + 24, + 23, + 23, + 23, + 23, + 23, + 23, + 23, + 22, + 22, + 22, + 22, + 22, + 22, + 21, + 21, + 21, + 21, + 19, + 19, + 18, + 18, + 18, + 18, + 17, + 17, + 17, + 17, + 17, + 17, + 17, + 17, + 17, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 16, + 15, + 15, + 15, + 15, + 15, + 15, + 14, + 14, + 14, + 14, + 14, + 14, + 14, + 13, + 13, + 13, + 13, + 13, + 13, + 13, + 12, + 12, + 12, + 11, + 11, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 7, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 6, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "re3data subject coverage" + }, + "xaxis": { + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "re3data_subjects = re3data_df.explode('subjects').groupby('subjects')[['url']].count().sort_values('url', ascending=False)\n", + "\n", + "data = [\n", + " go.Bar(\n", + " x=re3data_subjects.index,\n", + " y=re3data_subjects['url']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='re3data subject coverage',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# OpenDOAR" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
010|opendoar____::e833e042f509c996b1b25324d56659fbhttp://www.bilbao.net/bldBLD - Bilboko Liburutegi DigitalaBLD - Bilboko Liburutegi DigitalaBLD is a repository of digital documents, desi...43.256699-2.924100[]
110|opendoar____::f621585df244e9596dc70a39b579efb1https://researchdirect.westernsydney.edu.au/Western Sydney ResearchDirectWestern Sydney ResearchDirectNaN0.0000000.000000[]
210|opendoar____::437d7d1d97917cd627a34a6a0fb41136http://redress.lancs.ac.uk/Learning_Space/Learning Space CatalogueNaNThis repository is a Social Science e-Science ...54.010760-2.784990['Social Sciences General', 'Science General',...
310|opendoar____::d840cc5d906c3e9c84374c8919d2074ehttp://digitallibrary.usc.edu/search/controlle...USC Digital LibraryUSC Digital LibraryThis is an institutional repository providing ...34.052200-118.242996[]
510|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010http://www.ufgd.edu.br:8080/jspui/Repositório de Divulgação das Produções Cientí...Repositório de Divulgação das Produções Cientí...This site provides access to the research outp...-22.221800-54.806400[]
\n", + "
" + ], + "text/plain": [ + " id \\\n", + "0 10|opendoar____::e833e042f509c996b1b25324d56659fb \n", + "1 10|opendoar____::f621585df244e9596dc70a39b579efb1 \n", + "2 10|opendoar____::437d7d1d97917cd627a34a6a0fb41136 \n", + "3 10|opendoar____::d840cc5d906c3e9c84374c8919d2074e \n", + "5 10|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010 \n", + "\n", + " url \\\n", + "0 http://www.bilbao.net/bld \n", + "1 https://researchdirect.westernsydney.edu.au/ \n", + "2 http://redress.lancs.ac.uk/Learning_Space/ \n", + "3 http://digitallibrary.usc.edu/search/controlle... \n", + "5 http://www.ufgd.edu.br:8080/jspui/ \n", + "\n", + " official_name \\\n", + "0 BLD - Bilboko Liburutegi Digitala \n", + "1 Western Sydney ResearchDirect \n", + "2 Learning Space Catalogue \n", + "3 USC Digital Library \n", + "5 Repositório de Divulgação das Produções Cientí... \n", + "\n", + " english_name \\\n", + "0 BLD - Bilboko Liburutegi Digitala \n", + "1 Western Sydney ResearchDirect \n", + "2 NaN \n", + "3 USC Digital Library \n", + "5 Repositório de Divulgação das Produções Cientí... \n", + "\n", + " description latitude longitude \\\n", + "0 BLD is a repository of digital documents, desi... 43.256699 -2.924100 \n", + "1 NaN 0.000000 0.000000 \n", + "2 This repository is a Social Science e-Science ... 54.010760 -2.784990 \n", + "3 This is an institutional repository providing ... 34.052200 -118.242996 \n", + "5 This site provides access to the research outp... -22.221800 -54.806400 \n", + "\n", + " subjects \n", + "0 [] \n", + "1 [] \n", + "2 ['Social Sciences General', 'Science General',... \n", + "3 [] \n", + "5 [] " + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')\n", + "opendoar_df = opendoar_df[opendoar_df.id.str.contains('opendoar')]\n", + "opendoar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 []\n", + "1 []\n", + "2 ['Social Sciences General', 'Science General',...\n", + "3 []\n", + "5 []\n", + " ... \n", + "8701 ['Multidisciplinary']\n", + "8702 []\n", + "8703 ['Business and Economics']\n", + "8704 ['Earth and Planetary Sciences', 'Ecology and ...\n", + "8706 []\n", + "Name: subjects, Length: 6014, dtype: object" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df.subjects" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_df['subjects'] = opendoar_df.subjects.apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_cleaned_subjects = opendoar_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n", + " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n", + " .reset_index()\\\n", + " .groupby('index')[0].apply(lambda x: merge_lists(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "index\n", + "0 []\n", + "1 []\n", + "2 [Social Sciences General, Science General, Com...\n", + "3 []\n", + "5 []\n", + " ... \n", + "8701 [Multidisciplinary]\n", + "8702 []\n", + "8703 [Business, Economics]\n", + "8704 [Earth, Planetary Sciences, Ecology, Environme...\n", + "8706 []\n", + "Name: 0, Length: 6014, dtype: object" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_cleaned_subjects" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_df.drop(columns=['subjects'], inplace=True)\n", + "opendoar_df.rename(columns={0: 'subjects'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
count601460136014550057766014.0000006014.0000006014
unique60145953594654134920NaNNaN201
top10|opendoar____::a2557a7b2e94197ff767970b67041697http://harp.lib.hiroshima-u.ac.jp/Hiroshima Associated Repository PortalAURAThis site provides access to the research outp...NaNNaN[]
freq133498NaNNaN5273
meanNaNNaNNaNNaNNaN38.6493937.810948NaN
stdNaNNaNNaNNaNNaN788.40617371.689788NaN
minNaNNaNNaNNaNNaN-79.029999-683.103027NaN
25%NaNNaNNaNNaNNaN4.644632-49.273300NaN
50%NaNNaNNaNNaNNaN37.9304494.788870NaN
75%NaNNaNNaNNaNNaN47.29440030.685501NaN
maxNaNNaNNaNNaNNaN61138.800781178.438995NaN
\n", + "
" + ], + "text/plain": [ + " id \\\n", + "count 6014 \n", + "unique 6014 \n", + "top 10|opendoar____::a2557a7b2e94197ff767970b67041697 \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " url \\\n", + "count 6013 \n", + "unique 5953 \n", + "top http://harp.lib.hiroshima-u.ac.jp/ \n", + "freq 3 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " official_name english_name \\\n", + "count 6014 5500 \n", + "unique 5946 5413 \n", + "top Hiroshima Associated Repository Portal AURA \n", + "freq 3 4 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " description latitude \\\n", + "count 5776 6014.000000 \n", + "unique 4920 NaN \n", + "top This site provides access to the research outp... NaN \n", + "freq 98 NaN \n", + "mean NaN 38.649393 \n", + "std NaN 788.406173 \n", + "min NaN -79.029999 \n", + "25% NaN 4.644632 \n", + "50% NaN 37.930449 \n", + "75% NaN 47.294400 \n", + "max NaN 61138.800781 \n", + "\n", + " longitude subjects \n", + "count 6014.000000 6014 \n", + "unique NaN 201 \n", + "top NaN [] \n", + "freq NaN 5273 \n", + "mean 7.810948 NaN \n", + "std 71.689788 NaN \n", + "min -683.103027 NaN \n", + "25% -49.273300 NaN \n", + "50% 4.788870 NaN \n", + "75% 30.685501 NaN \n", + "max 178.438995 NaN " + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df.describe(include='all')" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "Multidisciplinary", + "Medicine", + "Health", + "Science General", + "Technology General", + "Economics", + "Business", + "Archaeology", + "Social Sciences General", + "History", + "Politics", + "Environment", + "Ecology", + "Law", + "Computers", + "IT", + "Biology", + "Biochemistry", + "Information Science", + "Library", + "Humanities General", + "Arts", + "Education", + " Food", + "Agriculture", + "Statistics", + "Mathematics", + "Literature", + "Veterinary", + "Astronomy", + "Physics", + "Geography", + "Regional Studies", + "Language", + "Religion", + "Chemical Technology", + "Chemistry", + "Philosophy", + "Fine", + "Performing Arts", + "Planning", + "Psychology", + "Management", + "Planetary Sciences", + "Earth", + "Electrical", + "Electronic Engineering", + "Architecture", + "Civil Engineering", + "Mechanical Engineering", + "Materials", + " History", + " Philosophy", + " Health", + "Social Sciences General ", + " Language", + " Technology General", + " Law", + "Performing Arts ", + " Science General", + "Medicine ", + "IT ", + "Veterinary " + ], + "y": [ + 466, + 67, + 66, + 63, + 53, + 52, + 52, + 49, + 48, + 47, + 44, + 44, + 44, + 43, + 43, + 42, + 40, + 40, + 36, + 36, + 35, + 35, + 32, + 31, + 31, + 30, + 30, + 30, + 30, + 29, + 29, + 29, + 29, + 29, + 27, + 27, + 27, + 25, + 23, + 22, + 17, + 17, + 17, + 16, + 16, + 12, + 12, + 12, + 8, + 7, + 7, + 2, + 2, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "OpenDOAR subject coverage" + }, + "xaxis": { + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "opendoar_subjects = opendoar_df.explode('subjects').groupby('subjects')[['url']].count().sort_values('url', ascending=False)\n", + "\n", + "data = [\n", + " go.Bar(\n", + " x=opendoar_subjects.index,\n", + " y=opendoar_subjects['url']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='OpenDOAR subject coverage',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
latlonnameadmin1admin2cc
043.26271-2.92528BilbaoBasque CountryBizkaiaES
14.88447-1.75536TakoradiWesternGH
253.98333-2.78333GalgateEnglandLancashireGB
334.05223-118.24368Los AngelesCaliforniaLos Angeles CountyUS
4-22.22111-54.80556DouradosMato Grosso do SulDouradosBR
.....................
600940.8563114.24641NapoliCampaniaProvincia di NapoliIT
601038.1939415.55256MessinaSicilyMessinaIT
601154.3213310.13489KielSchleswig-HolsteinDE
601243.40785-73.25955GranvilleNew YorkWashington CountyUS
601333.96095-83.37794AthensGeorgiaClarke CountyUS
\n", + "

6014 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " lat lon name admin1 \\\n", + "0 43.26271 -2.92528 Bilbao Basque Country \n", + "1 4.88447 -1.75536 Takoradi Western \n", + "2 53.98333 -2.78333 Galgate England \n", + "3 34.05223 -118.24368 Los Angeles California \n", + "4 -22.22111 -54.80556 Dourados Mato Grosso do Sul \n", + "... ... ... ... ... \n", + "6009 40.85631 14.24641 Napoli Campania \n", + "6010 38.19394 15.55256 Messina Sicily \n", + "6011 54.32133 10.13489 Kiel Schleswig-Holstein \n", + "6012 43.40785 -73.25955 Granville New York \n", + "6013 33.96095 -83.37794 Athens Georgia \n", + "\n", + " admin2 cc \n", + "0 Bizkaia ES \n", + "1 GH \n", + "2 Lancashire GB \n", + "3 Los Angeles County US \n", + "4 Dourados BR \n", + "... ... .. \n", + "6009 Provincia di Napoli IT \n", + "6010 Messina IT \n", + "6011 DE \n", + "6012 Washington County US \n", + "6013 Clarke County US \n", + "\n", + "[6014 rows x 6 columns]" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reverse_geocoding = pd.DataFrame(rg.search(opendoar_df[['latitude', 'longitude']].apply(tuple, axis=1).tolist()))\n", + "reverse_geocoding['lat'] = reverse_geocoding['lat'].astype('float')\n", + "reverse_geocoding['lon'] = reverse_geocoding['lon'].astype('float')\n", + "reverse_geocoding" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_df = opendoar_df.join(reverse_geocoding[['cc']])" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_df.loc[(opendoar_df.latitude == 0.0) & (opendoar_df.longitude == 0.0), ['latitude', 'longitude', 'cc']] = [np.nan, np.nan, np.nan]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "US", + "GH", + "JP", + "GB", + "DE", + "ES", + "BR", + "PE", + "TR", + "FR", + "HR", + "ID", + "IT", + "PL", + "CA", + "AU", + "IN", + "UA", + "NO", + "CO", + "NL", + "AR", + "CN", + "PT", + "TW", + "CH", + "SE", + "GR", + "ZA", + "MX", + "AT", + "HU", + "BY", + "BE", + "RS", + "KE", + "EC", + "FI", + "NG", + "RU", + "IE", + "KR", + "MY", + "CZ", + "IR", + "VE", + "CL", + "EG", + "LT", + "BD", + "LK", + "DK", + "NZ", + "TZ", + "UG", + "DZ", + "SA", + "NI", + "MD", + "SD", + "ZW", + "SI", + "CU", + "KZ", + "HK", + "TH", + "JM", + "EE", + "SV", + "UY", + "MK", + "PH", + "PS", + "BW", + "PK", + "BO", + "SN", + "DO", + "LB", + "LV", + "FJ", + "NA", + "SG", + "BG", + "YE", + "SJ", + "LY", + "RO", + "PY", + "MN", + "CR", + "IL", + "TN", + "PA", + "MZ", + "CY", + "TT", + "XK", + "VA", + "VN", + "SY", + "ZM", + "AE", + "RW", + "IS", + "AM", + "AZ", + "BN", + "CM", + "CV", + "ET", + "HN", + "IQ", + "KG", + "PR", + "LA", + "AL", + "LU", + "MO", + "MV", + "MW", + "NC", + "NP", + "LS" + ], + "y": [ + 541, + 410, + 345, + 191, + 174, + 104, + 100, + 83, + 82, + 79, + 79, + 77, + 76, + 69, + 67, + 64, + 63, + 61, + 58, + 54, + 47, + 43, + 38, + 37, + 36, + 36, + 35, + 29, + 27, + 27, + 26, + 26, + 25, + 25, + 22, + 20, + 20, + 20, + 20, + 18, + 18, + 17, + 16, + 14, + 13, + 12, + 11, + 10, + 10, + 9, + 9, + 9, + 9, + 8, + 8, + 8, + 8, + 8, + 7, + 7, + 7, + 6, + 6, + 6, + 6, + 6, + 6, + 5, + 5, + 4, + 4, + 4, + 4, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "OpenDOAR country coverage" + }, + "xaxis": { + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "opendoar_countries = opendoar_df.groupby('cc')[['id']].count().sort_values('id', ascending=False)\n", + "\n", + "data = [\n", + " go.Bar(\n", + " x=opendoar_countries.index,\n", + " y=opendoar_countries['id']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='OpenDOAR country coverage',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/references/.gitkeep b/references/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/reports/.gitkeep b/reports/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/reports/figures/.gitkeep b/reports/figures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d4f7d11 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +# local package +-e . + +# external requirements +click +Sphinx +coverage +awscli +flake8 +python-dotenv>=0.5.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..570ed7b --- /dev/null +++ b/setup.py @@ -0,0 +1,10 @@ +from setuptools import find_packages, setup + +setup( + name='src', + packages=find_packages(), + version='0.1.0', + description='A short description of the project.', + author='Andrea Mannocci', + license='MIT', +) diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/.gitkeep b/src/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/data/__init__.py b/src/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py new file mode 100644 index 0000000..1556beb --- /dev/null +++ b/src/data/make_dataset.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +import csv +import os +import json +import click +import logging +from pathlib import Path +from dotenv import find_dotenv, load_dotenv + + +def get_value_or_none(obj, key): + if key in obj: + return obj[key]['value'] + else: + return None + +@click.command() +@click.argument('input_filepath', type=click.Path(exists=True)) +@click.argument('output_filepath', type=click.Path()) +def main(input_filepath, output_filepath): + """ Runs data processing scripts to turn raw data from (../raw) into + cleaned data ready to be analyzed (saved in ../processed). + """ + logger = logging.getLogger(__name__) + logger.info('making final data set from raw data') + + with open(os.path.join(input_filepath, 'OpenAIRE_DS_re3data_opendoar.json'), mode='r') as f: + with open(os.path.join(output_filepath, 're3data_opendoar.csv'), mode='w') as csvfile: + csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(['id', 'url', 'official_name', 'english_name', 'description', 'latitude', 'longitude', 'subjects']) + + for line in f: + repo = json.loads(line) + identifier = repo['id'] + official_name = repo['officialname']['value'] + url = get_value_or_none(repo, 'websiteurl') + english_name = get_value_or_none(repo, 'englishname') + description = get_value_or_none(repo, 'description') + latitude = get_value_or_none(repo, 'latitude') + longitude = get_value_or_none(repo, 'longitude') + + subjects = [] + for s in repo['subjects']: + subjects.append(s['value']) + + csv_writer.writerow([identifier, url, official_name, english_name, description, latitude, longitude, subjects]) + + +if __name__ == '__main__': + log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + logging.basicConfig(level=logging.INFO, format=log_fmt) + + # not used in this stub but often useful for finding various files + project_dir = Path(__file__).resolve().parents[2] + + # find .env automagically by walking up directories until it's found, then + # load up the .env entries as environment variables + load_dotenv(find_dotenv()) + + main() diff --git a/src/features/.gitkeep b/src/features/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/features/__init__.py b/src/features/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/features/build_features.py b/src/features/build_features.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/.gitkeep b/src/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/predict_model.py b/src/models/predict_model.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/train_model.py b/src/models/train_model.py new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/.gitkeep b/src/visualization/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/__init__.py b/src/visualization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/visualize.py b/src/visualization/visualize.py new file mode 100644 index 0000000..e69de29 diff --git a/test_environment.py b/test_environment.py new file mode 100644 index 0000000..d0ac4a7 --- /dev/null +++ b/test_environment.py @@ -0,0 +1,25 @@ +import sys + +REQUIRED_PYTHON = "python3" + + +def main(): + system_major = sys.version_info.major + if REQUIRED_PYTHON == "python": + required_major = 2 + elif REQUIRED_PYTHON == "python3": + required_major = 3 + else: + raise ValueError("Unrecognized python interpreter: {}".format( + REQUIRED_PYTHON)) + + if system_major != required_major: + raise TypeError( + "This project requires Python {}. Found: Python {}".format( + required_major, sys.version)) + else: + print(">>> Development environment passes all tests!") + + +if __name__ == '__main__': + main() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..c32fbd8 --- /dev/null +++ b/tox.ini @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 79 +max-complexity = 10