From 9c2a1bb846085049759f24137b0634ce0f598f27 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Thu, 18 Mar 2021 17:43:00 +0100 Subject: [PATCH] first commit --- .gitignore | 89 + LICENSE | 10 + Makefile | 144 + README.md | 57 + docs/Makefile | 153 + docs/commands.rst | 10 + docs/conf.py | 244 + docs/getting-started.rst | 6 + docs/index.rst | 24 + docs/make.bat | 190 + models/.gitkeep | 0 notebooks/.gitkeep | 0 notebooks/01-Exploration.ipynb | 9296 +++++++++++++++++++++++++ notebooks/02-Spam filter.ipynb | 1096 +++ notebooks/03-Feature extraction.ipynb | 2422 +++++++ references/.gitkeep | 0 reports/.gitkeep | 0 reports/figures/.gitkeep | 0 requirements.txt | 10 + setup.py | 10 + src/__init__.py | 0 src/data/.gitkeep | 0 src/data/__init__.py | 0 src/data/make_dataset.py | 30 + src/features/.gitkeep | 0 src/features/__init__.py | 0 src/features/build_features.py | 0 src/models/.gitkeep | 0 src/models/__init__.py | 0 src/models/predict_model.py | 0 src/models/train_model.py | 0 src/visualization/.gitkeep | 0 src/visualization/__init__.py | 0 src/visualization/visualize.py | 0 test_environment.py | 25 + tox.ini | 3 + 36 files changed, 13819 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100644 docs/Makefile create mode 100644 docs/commands.rst create mode 100644 docs/conf.py create mode 100644 docs/getting-started.rst create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 models/.gitkeep create mode 100644 notebooks/.gitkeep create mode 100644 notebooks/01-Exploration.ipynb create mode 100644 notebooks/02-Spam filter.ipynb create mode 100644 notebooks/03-Feature extraction.ipynb create mode 100644 references/.gitkeep create mode 100644 reports/.gitkeep create mode 100644 reports/figures/.gitkeep create mode 100644 requirements.txt create mode 100644 setup.py create mode 100644 src/__init__.py create mode 100644 src/data/.gitkeep create mode 
100644 src/data/__init__.py create mode 100644 src/data/make_dataset.py create mode 100644 src/features/.gitkeep create mode 100644 src/features/__init__.py create mode 100644 src/features/build_features.py create mode 100644 src/models/.gitkeep create mode 100644 src/models/__init__.py create mode 100644 src/models/predict_model.py create mode 100644 src/models/train_model.py create mode 100644 src/visualization/.gitkeep create mode 100644 src/visualization/__init__.py create mode 100644 src/visualization/visualize.py create mode 100644 test_environment.py create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3439057 --- /dev/null +++ b/.gitignore @@ -0,0 +1,89 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# DotEnv configuration +.env + +# Database +*.db +*.rdb + +# Pycharm +.idea + +# VS Code +.vscode/ + +# Spyder +.spyproject/ + +# Jupyter NB Checkpoints +.ipynb_checkpoints/ + +# exclude data from source control by default +/data + +# Mac OS-specific storage files +.DS_Store + +# vim +*.swp +*.swo + +# Mypy cache +.mypy_cache/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..453f5be --- /dev/null +++ b/LICENSE @@ -0,0 +1,10 @@ + +The MIT License (MIT) +Copyright (c) 2021, Andrea Mannocci + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..227fcf0 --- /dev/null +++ b/Makefile @@ -0,0 +1,144 @@ +.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3 + +################################################################################# +# GLOBALS # +################################################################################# + +PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) +BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://') +PROFILE = default +PROJECT_NAME = orcid-classifier +PYTHON_INTERPRETER = python3 + +ifeq (,$(shell which conda)) +HAS_CONDA=False +else +HAS_CONDA=True +endif + +################################################################################# +# COMMANDS # +################################################################################# + +## Install Python Dependencies +requirements: test_environment + $(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel + $(PYTHON_INTERPRETER) -m pip install -r requirements.txt + +## Make Dataset +data: requirements + $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed + +## Delete all compiled Python files +clean: + find . -type f -name "*.py[co]" -delete + find . -type d -name "__pycache__" -delete + +## Lint using flake8 +lint: + flake8 src + +## Upload Data to S3 +sync_data_to_s3: +ifeq (default,$(PROFILE)) + aws s3 sync data/ s3://$(BUCKET)/data/ +else + aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE) +endif + +## Download Data from S3 +sync_data_from_s3: +ifeq (default,$(PROFILE)) + aws s3 sync s3://$(BUCKET)/data/ data/ +else + aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE) +endif + +## Set up python interpreter environment +create_environment: +ifeq (True,$(HAS_CONDA)) + @echo ">>> Detected conda, creating conda environment." 
+ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) + conda create --name $(PROJECT_NAME) python=3 +else + conda create --name $(PROJECT_NAME) python=2.7 +endif + @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" +else + $(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper + @echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\ + export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" + @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" + @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)" +endif + +## Test python environment is setup correctly +test_environment: + $(PYTHON_INTERPRETER) test_environment.py + +################################################################################# +# PROJECT RULES # +################################################################################# + + + +################################################################################# +# Self Documenting Commands # +################################################################################# + +.DEFAULT_GOAL := help + +# Inspired by +# sed script explained: +# /^##/: +# * save line in hold space +# * purge line +# * Loop: +# * append newline + line to hold space +# * go to next line +# * if line starts with doc comment, strip comment character off and loop +# * remove target prerequisites +# * append hold space (+ newline) to line +# * replace newline plus comments by `---` +# * print line +# Separate expressions are necessary because labels cannot be delimited by +# semicolon; see +.PHONY: help +help: + @echo "$$(tput bold)Available rules:$$(tput sgr0)" + @echo + @sed -n -e "/^## / { \ + h; \ + s/.*//; \ + :doc" \ + -e "H; \ + n; \ + s/^## //; \ + t doc" \ + -e "s/:.*//; \ + G; \ + s/\\n## /---/; \ + s/\\n/ /g; 
\ + p; \ + }" ${MAKEFILE_LIST} \ + | LC_ALL='C' sort --ignore-case \ + | awk -F '---' \ + -v ncol=$$(tput cols) \ + -v indent=19 \ + -v col_on="$$(tput setaf 6)" \ + -v col_off="$$(tput sgr0)" \ + '{ \ + printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ + n = split($$2, words, " "); \ + line_length = ncol - indent; \ + for (i = 1; i <= n; i++) { \ + line_length -= length(words[i]) + 1; \ + if (line_length <= 0) { \ + line_length = ncol - indent - length(words[i]) - 1; \ + printf "\n%*s ", -indent, " "; \ + } \ + printf "%s ", words[i]; \ + } \ + printf "\n"; \ + }' \ + | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') diff --git a/README.md b/README.md new file mode 100644 index 0000000..45d278e --- /dev/null +++ b/README.md @@ -0,0 +1,57 @@ +orcid-classifier +============================== + +A short description of the project. + +Project Organization +------------ + + ├── LICENSE + ├── Makefile <- Makefile with commands like `make data` or `make train` + ├── README.md <- The top-level README for developers using this project. + ├── data + │   ├── external <- Data from third party sources. + │   ├── interim <- Intermediate data that has been transformed. + │   ├── processed <- The final, canonical data sets for modeling. + │   └── raw <- The original, immutable data dump. + │ + ├── docs <- A default Sphinx project; see sphinx-doc.org for details + │ + ├── models <- Trained and serialized models, model predictions, or model summaries + │ + ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), + │ the creator's initials, and a short `-` delimited description, e.g. + │ `1.0-jqp-initial-data-exploration`. + │ + ├── references <- Data dictionaries, manuals, and all other explanatory materials. + │ + ├── reports <- Generated analysis as HTML, PDF, LaTeX, etc. 
+ │   └── figures <- Generated graphics and figures to be used in reporting + │ + ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g. + │ generated with `pip freeze > requirements.txt` + │ + ├── setup.py <- makes project pip installable (pip install -e .) so src can be imported + ├── src <- Source code for use in this project. + │   ├── __init__.py <- Makes src a Python module + │ │ + │   ├── data <- Scripts to download or generate data + │   │   └── make_dataset.py + │ │ + │   ├── features <- Scripts to turn raw data into features for modeling + │   │   └── build_features.py + │ │ + │   ├── models <- Scripts to train models and then use trained models to make + │ │ │ predictions + │   │   ├── predict_model.py + │   │   └── train_model.py + │ │ + │   └── visualization <- Scripts to create exploratory and results oriented visualizations + │   └── visualize.py + │ + └── tox.ini <- tox file with settings for running tox; see tox.readthedocs.io + + +-------- + +

Project based on the cookiecutter data science project template. #cookiecutterdatascience

diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..386edbb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,153 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo 
"Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/orcid-classifier.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/orcid-classifier.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/orcid-classifier" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/orcid-classifier" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." 
+ +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/docs/commands.rst b/docs/commands.rst new file mode 100644 index 0000000..2d162f3 --- /dev/null +++ b/docs/commands.rst @@ -0,0 +1,10 @@ +Commands +======== + +The Makefile contains the central entry points for common tasks related to this project. 
+ +Syncing data to S3 +^^^^^^^^^^^^^^^^^^ + +* `make sync_data_to_s3` will use `aws s3 sync` to recursively sync files in `data/` up to `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/`. +* `make sync_data_from_s3` will use `aws s3 sync` to recursively sync files from `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/` to `data/`. diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..2943c79 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,244 @@ +# -*- coding: utf-8 -*- +# +# orcid-classifier documentation build configuration file, created by +# sphinx-quickstart. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import os +import sys + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = u'orcid-classifier' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. 
+# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
+# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'orcid-classifierdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', + 'orcid-classifier.tex', + u'orcid-classifier Documentation', + u"Andrea Mannocci", 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'orcid-classifier', u'orcid-classifier Documentation', + [u"Andrea Mannocci"], 1) +] + +# If true, show URL addresses after external links. 
+# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'orcid-classifier', u'orcid-classifier Documentation', + u"Andrea Mannocci", 'orcid-classifier', + 'A short description of the project.', 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# texinfo_appendices = [] + +# If false, no module index is generated. +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# texinfo_show_urls = 'footnote' diff --git a/docs/getting-started.rst b/docs/getting-started.rst new file mode 100644 index 0000000..b4f71c3 --- /dev/null +++ b/docs/getting-started.rst @@ -0,0 +1,6 @@ +Getting started +=============== + +This is where you describe how to get set up on a clean install, including the +commands necessary to get the raw data (using the `sync_data_from_s3` command, +for example), and then how to make the cleaned, final data sets. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..584423a --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,24 @@ +.. orcid-classifier documentation master file, created by + sphinx-quickstart. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +orcid-classifier documentation! +============================================== + +Contents: + +.. 
toctree:: + :maxdepth: 2 + + getting-started + commands + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..5879b1a --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,190 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^<target^>` where ^<target^> is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. 
+ goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\orcid-classifier.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\orcid-classifier.qhc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. 
+ goto end +) + +:end diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb new file mode 100644 index 0000000..a57ac02 --- /dev/null +++ b/notebooks/01-Exploration.ipynb @@ -0,0 +1,9296 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Explorative analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TODO:\n", + "- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)\n", + "- Build a catalogue of cases (e.g. authors publishing with an empty ORCID profile, authors publishing but not on OpenAIRE, etc.)\n", + "- Is the temporal dimension of any use?\n", + "- Can we access private info thanks to the OpenAIRE-ORCID agreement?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import ast\n", + "import tldextract\n", + "import numpy\n", + "\n", + "import plotly\n", + "from plotly.offline import iplot, init_notebook_mode\n", + "import plotly.graph_objs as go\n", + "\n", + "init_notebook_mode(connected=True)\n", + "TOP_N = 30\n", + "TOP_RANGE = [-.5, TOP_N - 1 + .5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notable solid ORCID iDs for explorative purposes:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "AM = '0000-0002-5193-7851'\n", + "PP = '0000-0002-8588-4196'\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anomalous ORCID profiles:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": 
{}, + "outputs": [], + "source": [ + "JOURNAL = '0000-0003-1815-5732'\n", + "NOINFO= '0000-0001-5009-2052'\n", + "# find group-shared ORCiD" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notable fake ORCID iDs for explorative purposes:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "SCAFFOLD = '0000-0001-5004-7761'\n", + "WHATSAPP = '0000-0001-6997-9470'\n", + "PENIS = '0000-0002-3399-7287'\n", + "BITCOIN = '0000-0002-7518-6845'\n", + "FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment\n", + "CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)\n", + "PLUMBER = '0000-0002-1700-8311' # URL > 10 + works " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n", + " names = ['orcid', 'claimed','verified_email', 'verified_primary_email', \n", + " 'given_names', 'family_name', 'biography', 'other_names', 'urls', \n", + " 'primary_email', 'other_emails', 'keywords', 'external_ids', 'education', \n", + " 'employment', 'n_works', 'works_source'])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
75520000-0001-7831-7567111VahabVahdatNaNNaNNaNNaNNaNNaN[[\"Scopus Author ID\", \"57193490305\"], [\"Scopus...[[\"Industrial Engineering\", \"PhD\", \"Northeaste...[[\"Post-doctorate fellow\", \"Harvard Medical Sc...25[\"Vahab Vahdat\", \"Scopus - Elsevier\", \"Multidi...
84160000-0001-8161-1345111AYFERTEKIN ATACANNaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN
164980000-0002-1133-1505111XianrongLaiNaNNaNNaNNaNNaNNaN[[\"Scopus Author ID\", \"15769435500\"]][[\"Department of pharmacy\", \"Bachelor of Tradi...[[\"Associate Research, Professor\", \"Chengdu Un...115[\"Xianrong Lai\", \"Scopus - Elsevier\", \"Crossref\"]
168300000-0002-1257-5536111AlexandraZimmerNaNNaNNaNNaNNaNNaNNaNNaN[[\"Research assistent\", \"Fraunhofer-Institut f...0NaN
188350000-0002-2026-4156111FatmaSri WahyuniNaN[\"Ayu\"]NaNNaNNaNNaN[[\"ResearcherID\", \"C-5194-2015\"], [\"Scopus Aut...[[\"Biosains\", \"PHD\", \"Universiti Putra Malaysi...[[\"Lecturer\", \"Universitas Andalas\", \"Padang\",...27[\"Publons\", \"Crossref Metadata Search\", \"Scopu...
......................................................
107332930000-0002-9887-7788111MarkétaLaštůvkováNaNNaNNaNNaNNaNNaNNaNNaN[[\"\", \"VSB - Technical University of Ostrava\",...0NaN
107372580000-0003-1367-8104111LORENAGUTIÉRREZ GARCÍANaNNaN[[\"LinkedIn\", \"https://www.linkedin.com/in/lor...lorenagg@unex.esNaN[\"Agroecolog\\u00eda, Bot\\u00e1nica, Did\\u00e1c...[[\"ResearcherID\", \"AAE-6316-2021\"]][[\"\", \"M\\u00e1ster en Formaci\\u00f3n del profe...[[\"PCI\", \"Universidad de Extremadura - Campus ...14[\"Multidisciplinary Digital Publishing Institu...
107383080000-0003-1741-3437111XingLiuNaNNaNNaNNaNNaNNaN[[\"ResearcherID\", \"S-3053-2017\"]]NaNNaN0NaN
107414600000-0003-2909-8585111YusufÖzcanNaNNaNNaNNaNNaNNaNNaN[[\"\\u0130lahiyat Fak\\u00fcltesi\", \"Doktora\", \"...[[\"Research Assistant\", \"\\u00c7ukurova Univers...0NaN
107450780000-0003-4259-5324111P Rama MohanNaNNaNNaNNaNNaNNaNNaN[[\"Scopus Author ID\", \"24776757000\"]][[\"EEE Department\", \"Ph.D. (Power Electronics ...[[\"Associate Professor\", \"RGM College of Engin...21[\"Scopus - Elsevier\", \"P Rama Mohan\"]
\n", + "

2418 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email \\\n", + "7552 0000-0001-7831-7567 1 1 \n", + "8416 0000-0001-8161-1345 1 1 \n", + "16498 0000-0002-1133-1505 1 1 \n", + "16830 0000-0002-1257-5536 1 1 \n", + "18835 0000-0002-2026-4156 1 1 \n", + "... ... ... ... \n", + "10733293 0000-0002-9887-7788 1 1 \n", + "10737258 0000-0003-1367-8104 1 1 \n", + "10738308 0000-0003-1741-3437 1 1 \n", + "10741460 0000-0003-2909-8585 1 1 \n", + "10745078 0000-0003-4259-5324 1 1 \n", + "\n", + " verified_primary_email given_names family_name biography \\\n", + "7552 1 Vahab Vahdat NaN \n", + "8416 1 AYFER TEKIN ATACAN NaN \n", + "16498 1 Xianrong Lai NaN \n", + "16830 1 Alexandra Zimmer NaN \n", + "18835 1 Fatma Sri Wahyuni NaN \n", + "... ... ... ... ... \n", + "10733293 1 Markéta Laštůvková NaN \n", + "10737258 1 LORENA GUTIÉRREZ GARCÍA NaN \n", + "10738308 1 Xing Liu NaN \n", + "10741460 1 Yusuf Özcan NaN \n", + "10745078 1 P Rama Mohan NaN NaN \n", + "\n", + " other_names urls \\\n", + "7552 NaN NaN \n", + "8416 NaN NaN \n", + "16498 NaN NaN \n", + "16830 NaN NaN \n", + "18835 [\"Ayu\"] NaN \n", + "... ... ... \n", + "10733293 NaN NaN \n", + "10737258 NaN [[\"LinkedIn\", \"https://www.linkedin.com/in/lor... \n", + "10738308 NaN NaN \n", + "10741460 NaN NaN \n", + "10745078 NaN NaN \n", + "\n", + " primary_email other_emails \\\n", + "7552 NaN NaN \n", + "8416 NaN NaN \n", + "16498 NaN NaN \n", + "16830 NaN NaN \n", + "18835 NaN NaN \n", + "... ... ... \n", + "10733293 NaN NaN \n", + "10737258 lorenagg@unex.es NaN \n", + "10738308 NaN NaN \n", + "10741460 NaN NaN \n", + "10745078 NaN NaN \n", + "\n", + " keywords \\\n", + "7552 NaN \n", + "8416 NaN \n", + "16498 NaN \n", + "16830 NaN \n", + "18835 NaN \n", + "... ... \n", + "10733293 NaN \n", + "10737258 [\"Agroecolog\\u00eda, Bot\\u00e1nica, Did\\u00e1c... \n", + "10738308 NaN \n", + "10741460 NaN \n", + "10745078 NaN \n", + "\n", + " external_ids \\\n", + "7552 [[\"Scopus Author ID\", \"57193490305\"], [\"Scopus... 
\n", + "8416 NaN \n", + "16498 [[\"Scopus Author ID\", \"15769435500\"]] \n", + "16830 NaN \n", + "18835 [[\"ResearcherID\", \"C-5194-2015\"], [\"Scopus Aut... \n", + "... ... \n", + "10733293 NaN \n", + "10737258 [[\"ResearcherID\", \"AAE-6316-2021\"]] \n", + "10738308 [[\"ResearcherID\", \"S-3053-2017\"]] \n", + "10741460 NaN \n", + "10745078 [[\"Scopus Author ID\", \"24776757000\"]] \n", + "\n", + " education \\\n", + "7552 [[\"Industrial Engineering\", \"PhD\", \"Northeaste... \n", + "8416 NaN \n", + "16498 [[\"Department of pharmacy\", \"Bachelor of Tradi... \n", + "16830 NaN \n", + "18835 [[\"Biosains\", \"PHD\", \"Universiti Putra Malaysi... \n", + "... ... \n", + "10733293 NaN \n", + "10737258 [[\"\", \"M\\u00e1ster en Formaci\\u00f3n del profe... \n", + "10738308 NaN \n", + "10741460 [[\"\\u0130lahiyat Fak\\u00fcltesi\", \"Doktora\", \"... \n", + "10745078 [[\"EEE Department\", \"Ph.D. (Power Electronics ... \n", + "\n", + " employment n_works \\\n", + "7552 [[\"Post-doctorate fellow\", \"Harvard Medical Sc... 25 \n", + "8416 NaN 0 \n", + "16498 [[\"Associate Research, Professor\", \"Chengdu Un... 115 \n", + "16830 [[\"Research assistent\", \"Fraunhofer-Institut f... 0 \n", + "18835 [[\"Lecturer\", \"Universitas Andalas\", \"Padang\",... 27 \n", + "... ... ... \n", + "10733293 [[\"\", \"VSB - Technical University of Ostrava\",... 0 \n", + "10737258 [[\"PCI\", \"Universidad de Extremadura - Campus ... 14 \n", + "10738308 NaN 0 \n", + "10741460 [[\"Research Assistant\", \"\\u00c7ukurova Univers... 0 \n", + "10745078 [[\"Associate Professor\", \"RGM College of Engin... 21 \n", + "\n", + " works_source \n", + "7552 [\"Vahab Vahdat\", \"Scopus - Elsevier\", \"Multidi... \n", + "8416 NaN \n", + "16498 [\"Xianrong Lai\", \"Scopus - Elsevier\", \"Crossref\"] \n", + "16830 NaN \n", + "18835 [\"Publons\", \"Crossref Metadata Search\", \"Scopu... \n", + "... ... \n", + "10733293 NaN \n", + "10737258 [\"Multidisciplinary Digital Publishing Institu... 
\n", + "10738308 NaN \n", + "10741460 NaN \n", + "10745078 [\"Scopus - Elsevier\", \"P Rama Mohan\"] \n", + "\n", + "[2418 rows x 17 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df.duplicated()]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop_duplicates(inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Basic column manipulation (interpret columns as lists when necessary)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.other_names.notna(), 'other_names'] = df.loc[df.other_names.notna(), 'other_names'].apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.keywords.notna(), 'keywords'] = df.loc[df.keywords.notna(), 'keywords'].apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.urls.notna(), 'urls'] = df.loc[df.urls.notna(), 'urls'].apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.other_emails.notna(), 'other_emails'] = df.loc[df.other_emails.notna(), 'other_emails'].apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.education.notna(), 'education'] = df.loc[df.education.notna(), 'education'].apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.employment.notna(), 'employment'] = df.loc[df.employment.notna(), 'employment'].apply(lambda x: ast.literal_eval(x))" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.external_ids.notna(), 'external_ids'] = df.loc[df.external_ids.notna(), 'external_ids'].apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.works_source.notna(), 'works_source'] = df.loc[df.works_source.notna(), 'works_source'].apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
00000-0001-5000-2053100JorgeJaramillo SanchezNaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN
10000-0001-5000-6548100WisemanBekelesiNaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN
20000-0001-5000-7962111ALICEINDIMULINaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN
30000-0001-5000-8586100shimji yunNaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN
40000-0001-5001-0256100SandroCaramaschiNaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "0 0000-0001-5000-2053 1 0 0 \n", + "1 0000-0001-5000-6548 1 0 0 \n", + "2 0000-0001-5000-7962 1 1 1 \n", + "3 0000-0001-5000-8586 1 0 0 \n", + "4 0000-0001-5001-0256 1 0 0 \n", + "\n", + " given_names family_name biography other_names urls primary_email \\\n", + "0 Jorge Jaramillo Sanchez NaN NaN NaN NaN \n", + "1 Wiseman Bekelesi NaN NaN NaN NaN \n", + "2 ALICE INDIMULI NaN NaN NaN NaN \n", + "3 shim ji yun NaN NaN NaN NaN \n", + "4 Sandro Caramaschi NaN NaN NaN NaN \n", + "\n", + " other_emails keywords external_ids education employment n_works \\\n", + "0 NaN NaN NaN NaN NaN 0 \n", + "1 NaN NaN NaN NaN NaN 0 \n", + "2 NaN NaN NaN NaN NaN 0 \n", + "3 NaN NaN NaN NaN NaN 0 \n", + "4 NaN NaN NaN NaN NaN 0 \n", + "\n", + " works_source \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
88404130000-0002-5193-7851111AndreaMannocciNaNNaN[[Personal website, https://andremann.github.i...andrea.mannocci@isti.cnr.itNaN[Data science , science of science, scholarly ...[[Scopus Author ID, 55233589900]][[Information engineering, Ph.D., Università d...[[Research Associate, Istituto di Scienza e Te...37[Scopus - Elsevier, Crossref Metadata Search, ...
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "8840413 0000-0002-5193-7851 1 1 1 \n", + "\n", + " given_names family_name biography other_names \\\n", + "8840413 Andrea Mannocci NaN NaN \n", + "\n", + " urls \\\n", + "8840413 [[Personal website, https://andremann.github.i... \n", + "\n", + " primary_email other_emails \\\n", + "8840413 andrea.mannocci@isti.cnr.it NaN \n", + "\n", + " keywords \\\n", + "8840413 [Data science , science of science, scholarly ... \n", + "\n", + " external_ids \\\n", + "8840413 [[Scopus Author ID, 55233589900]] \n", + "\n", + " education \\\n", + "8840413 [[Information engineering, Ph.D., Università d... \n", + "\n", + " employment n_works \\\n", + "8840413 [[Research Associate, Istituto di Scienza e Te... 37 \n", + "\n", + " works_source \n", + "8840413 [Scopus - Elsevier, Crossref Metadata Search, ... " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == AM]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
95170990000-0001-6997-9470111otherwhatsappNaNNaN[[Otherwhatsapp, https://otherwhatsapp.com/], ...NaNNaN[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...NaNNaNNaN0NaN
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "9517099 0000-0001-6997-9470 1 1 1 \n", + "\n", + " given_names family_name biography other_names \\\n", + "9517099 other whatsapp NaN NaN \n", + "\n", + " urls primary_email \\\n", + "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... NaN \n", + "\n", + " other_emails keywords \\\n", + "9517099 NaN [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... \n", + "\n", + " external_ids education employment n_works works_source \n", + "9517099 NaN NaN NaN 0 NaN " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == WHATSAPP]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "orcid 10744622\n", + "claimed 10744622\n", + "verified_email 10744622\n", + "verified_primary_email 10744622\n", + "given_names 10716789\n", + "family_name 10437094\n", + "biography 333885\n", + "other_names 544550\n", + "urls 688262\n", + "primary_email 121476\n", + "other_emails 47470\n", + "keywords 638634\n", + "external_ids 1285292\n", + "education 2402440\n", + "employment 2626670\n", + "n_works 10744622\n", + "works_source 2671906\n", + "dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
45952630000-0002-5154-6404111OlusolaBamisileNaNNaNNaNNaNNaNNaNNaN[[Energy Systems Engineering , Doctoral, Cypru...[[, University of Electronic Science and Techn...3[Multidisciplinary Digital Publishing Institut...
45952640000-0002-5154-6404111OlusolaBamisileNaNNaNNaNNaNNaNNaNNaN[[Energy Systems Engineering , Doctoral, Cypru...[[, University of Electronic Science and Techn...2[Crossref]
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "4595263 0000-0002-5154-6404 1 1 1 \n", + "4595264 0000-0002-5154-6404 1 1 1 \n", + "\n", + " given_names family_name biography other_names urls primary_email \\\n", + "4595263 Olusola Bamisile NaN NaN NaN NaN \n", + "4595264 Olusola Bamisile NaN NaN NaN NaN \n", + "\n", + " other_emails keywords external_ids \\\n", + "4595263 NaN NaN NaN \n", + "4595264 NaN NaN NaN \n", + "\n", + " education \\\n", + "4595263 [[Energy Systems Engineering , Doctoral, Cypru... \n", + "4595264 [[Energy Systems Engineering , Doctoral, Cypru... \n", + "\n", + " employment n_works \\\n", + "4595263 [[, University of Electronic Science and Techn... 3 \n", + "4595264 [[, University of Electronic Science and Techn... 2 \n", + "\n", + " works_source \n", + "4595263 [Multidisciplinary Digital Publishing Institut... \n", + "4595264 [Crossref] " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == '0000-0002-5154-6404']" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(index=4595264, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 10744621\n", + "unique 10744621\n", + "top 0000-0002-3376-9946\n", + "freq 1\n", + "Name: orcid, dtype: object" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['orcid'].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Primary email" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 121476\n", + "unique 121473\n", + "top maykin@owasp.org\n", + "freq 2\n", + "Name: primary_email, dtype: object" + ] + }, + "execution_count": 25, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "df['primary_email'].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dupe emails" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7483666 maykin@owasp.org\n", + "9068234 opercin@erbakan.edu.tr\n", + "10246485 patrick.davey@monash.edu\n", + "Name: primary_email, dtype: object" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['primary_email'].dropna().loc[df['primary_email'].duplicated()]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
37763500000-0002-0836-2271111MaykinWarasartNaNNaNNaNmaykin@owasp.org[maykin@dga.or.th]NaNNaNNaNNaN0NaN
74836660000-0001-9855-1676111MaykinWarasartNaNNaNNaNmaykin@owasp.org[maykin@dga.or.th, maykin@ieee.org]NaNNaNNaNNaN0NaN
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "3776350 0000-0002-0836-2271 1 1 1 \n", + "7483666 0000-0001-9855-1676 1 1 1 \n", + "\n", + " given_names family_name biography other_names urls primary_email \\\n", + "3776350 Maykin Warasart NaN NaN NaN maykin@owasp.org \n", + "7483666 Maykin Warasart NaN NaN NaN maykin@owasp.org \n", + "\n", + " other_emails keywords external_ids education \\\n", + "3776350 [maykin@dga.or.th] NaN NaN NaN \n", + "7483666 [maykin@dga.or.th, maykin@ieee.org] NaN NaN NaN \n", + "\n", + " employment n_works works_source \n", + "3776350 NaN 0 NaN \n", + "7483666 NaN 0 NaN " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['primary_email'] == 'maykin@owasp.org']" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
39950320000-0002-2232-9638111OsmanPerçinNaNNaNNaNopercin@erbakan.edu.trNaNNaNNaNNaNNaN0NaN
90682340000-0003-0033-0918111OsmanPERÇİNNaNNaNNaNopercin@erbakan.edu.trNaNNaNNaNNaN[[, Necmettin Erbakan University, Konya, , TR,...0NaN
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "3995032 0000-0002-2232-9638 1 1 1 \n", + "9068234 0000-0003-0033-0918 1 1 1 \n", + "\n", + " given_names family_name biography other_names urls \\\n", + "3995032 Osman Perçin NaN NaN NaN \n", + "9068234 Osman PERÇİN NaN NaN NaN \n", + "\n", + " primary_email other_emails keywords external_ids education \\\n", + "3995032 opercin@erbakan.edu.tr NaN NaN NaN NaN \n", + "9068234 opercin@erbakan.edu.tr NaN NaN NaN NaN \n", + "\n", + " employment n_works \\\n", + "3995032 NaN 0 \n", + "9068234 [[, Necmettin Erbakan University, Konya, , TR,... 0 \n", + "\n", + " works_source \n", + "3995032 NaN \n", + "9068234 NaN " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['primary_email'] == 'opercin@erbakan.edu.tr']" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
50877450000-0002-8774-0030111PatrickDaveyNaNNaNNaNpatrick.davey@monash.eduNaNNaNNaNNaN[[PhD Student, Monash University, Melbourne, V...1[Crossref]
102464850000-0002-9158-1757111PatrickDaveyNaNNaNNaNpatrick.davey@monash.eduNaN[Radiopharmaceuticals, Inorganic Chemistry, Bi...NaNNaN[[PhD Student, Monash University, Melbourne, ,...0NaN
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email \\\n", + "5087745 0000-0002-8774-0030 1 1 \n", + "10246485 0000-0002-9158-1757 1 1 \n", + "\n", + " verified_primary_email given_names family_name biography \\\n", + "5087745 1 Patrick Davey NaN \n", + "10246485 1 Patrick Davey NaN \n", + "\n", + " other_names urls primary_email other_emails \\\n", + "5087745 NaN NaN patrick.davey@monash.edu NaN \n", + "10246485 NaN NaN patrick.davey@monash.edu NaN \n", + "\n", + " keywords external_ids \\\n", + "5087745 NaN NaN \n", + "10246485 [Radiopharmaceuticals, Inorganic Chemistry, Bi... NaN \n", + "\n", + " education employment \\\n", + "5087745 NaN [[PhD Student, Monash University, Melbourne, V... \n", + "10246485 NaN [[PhD Student, Monash University, Melbourne, ,... \n", + "\n", + " n_works works_source \n", + "5087745 1 [Crossref] \n", + "10246485 0 NaN " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['primary_email'] == 'patrick.davey@monash.edu']" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "df['primary_email_domain'] = df['primary_email'].apply(lambda x: x.split('@')[1] if pd.notna(x) else x)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 121476\n", + "unique 17047\n", + "top gmail.com\n", + "freq 25892\n", + "Name: primary_email_domain, dtype: object" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['primary_email_domain'].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcid
primary_email_domain
gmail.com25892
hotmail.com3674
yahoo.com2578
163.com2067
yuhs.ac1124
......
iiap.gob.pe1
iiap.org.pe1
iibb.csic.es1
iic.hokudai.ac.jp1
zzuli.edu.cn1
\n", + "

17047 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " orcid\n", + "primary_email_domain \n", + "gmail.com 25892\n", + "hotmail.com 3674\n", + "yahoo.com 2578\n", + "163.com 2067\n", + "yuhs.ac 1124\n", + "... ...\n", + "iiap.gob.pe 1\n", + "iiap.org.pe 1\n", + "iibb.csic.es 1\n", + "iic.hokudai.ac.jp 1\n", + "zzuli.edu.cn 1\n", + "\n", + "[17047 rows x 1 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "primary_emails = df[['primary_email_domain', 'orcid']].groupby('primary_email_domain').count().sort_values('orcid', ascending=False)\n", + "primary_emails" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "gmail.com", + "hotmail.com", + "yahoo.com", + "163.com", + "yuhs.ac", + "qq.com", + "outlook.com", + "126.com", + "bu.edu", + "usgs.gov", + "mail.ru", + "usp.br", + "yahoo.com.br", + "ua.pt", + "umich.edu", + "ust.hk", + "foxmail.com", + "uomustansiriyah.edu.iq", + "yandex.ru", + "uq.edu.au", + "ukr.net", + "unesp.br", + "ucl.ac.uk", + "ieee.org", + "stcatz.ox.ac.uk", + "st-annes.ox.ac.uk", + "naver.com", + "yahoo.fr", + "ucm.es", + "live.com" + ], + "y": [ + 25892, + 3674, + 2578, + 2067, + 1124, + 1035, + 914, + 755, + 626, + 584, + 564, + 455, + 454, + 291, + 290, + 278, + 249, + 242, + 237, + 234, + 220, + 214, + 204, + 203, + 185, + 184, + 182, + 172, + 166, + 159 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + 
"endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + 
"#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 
0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ 
+ 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + 
"gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "Top 30 email domains" + }, + "xaxis": { + "range": [ + -0.5, + 29.5 + ], + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = [\n", + " go.Bar(\n", + " x=primary_emails[:30].sort_values(by=['orcid'], ascending=False).index,\n", + " y=primary_emails[:30].sort_values(by=['orcid'], ascending=False)['orcid']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Top 30 email domains',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Other emails" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_email_domains(lst):\n", + " res = []\n", + " for email in lst:\n", + " res.append(email.split('@')[1])\n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "df['other_email_domains'] = df['other_emails'].apply(lambda x: extract_email_domains(x) if isinstance(x, list) else x)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_sourceprimary_email_domainother_email_domains
340000-0001-5011-9833111MarkKilbaneNaNNaNNaNmark.kilbane@seh.ox.ac.uk[mark.kilbane@bsg.ox.ac.uk]NaNNaN[[Blavatnik School of Government; St Edmund Ha...NaN0NaNseh.ox.ac.uk[bsg.ox.ac.uk]
470000-0001-5017-1295111XinfengTangNaNNaNNaNNaN[tang.xinfeng@foxmail.com]NaN[[Scopus Author ID, 56927186900]][[, , University of Hong Kong, Hong Kong, , HK...NaN11[Scopus - Elsevier, Xinfeng Tang]NaN[foxmail.com]
2990000-0001-5109-3989111colintysallNaNNaNNaNNaN[colin.tysall@nhs.net]NaNNaNNaN[[Associate Mental Health Act Manager, Coventr...0NaNNaN[nhs.net]
8680000-0001-5320-1277111GökhanKESKİNNaNNaNNaN2012001598@stu.adu.edu.tr[gokhankkeskin@gmail.com]NaNNaNNaN[[, Adnan Menderes University, Aydin, , TR, gr...0NaNstu.adu.edu.tr[gmail.com]
11760000-0001-5434-9994111ElenaBorucuNaNNaNNaNlenapasali@gmail.com[epasali@yildiz.edu.tr]NaNNaNNaNNaN0NaNgmail.com[yildiz.edu.tr]
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "34 0000-0001-5011-9833 1 1 1 \n", + "47 0000-0001-5017-1295 1 1 1 \n", + "299 0000-0001-5109-3989 1 1 1 \n", + "868 0000-0001-5320-1277 1 1 1 \n", + "1176 0000-0001-5434-9994 1 1 1 \n", + "\n", + " given_names family_name biography other_names urls \\\n", + "34 Mark Kilbane NaN NaN NaN \n", + "47 Xinfeng Tang NaN NaN NaN \n", + "299 colin tysall NaN NaN NaN \n", + "868 Gökhan KESKİN NaN NaN NaN \n", + "1176 Elena Borucu NaN NaN NaN \n", + "\n", + " primary_email other_emails keywords \\\n", + "34 mark.kilbane@seh.ox.ac.uk [mark.kilbane@bsg.ox.ac.uk] NaN \n", + "47 NaN [tang.xinfeng@foxmail.com] NaN \n", + "299 NaN [colin.tysall@nhs.net] NaN \n", + "868 2012001598@stu.adu.edu.tr [gokhankkeskin@gmail.com] NaN \n", + "1176 lenapasali@gmail.com [epasali@yildiz.edu.tr] NaN \n", + "\n", + " external_ids \\\n", + "34 NaN \n", + "47 [[Scopus Author ID, 56927186900]] \n", + "299 NaN \n", + "868 NaN \n", + "1176 NaN \n", + "\n", + " education \\\n", + "34 [[Blavatnik School of Government; St Edmund Ha... \n", + "47 [[, , University of Hong Kong, Hong Kong, , HK... \n", + "299 NaN \n", + "868 NaN \n", + "1176 NaN \n", + "\n", + " employment n_works \\\n", + "34 NaN 0 \n", + "47 NaN 11 \n", + "299 [[Associate Mental Health Act Manager, Coventr... 0 \n", + "868 [[, Adnan Menderes University, Aydin, , TR, gr... 
0 \n", + "1176 NaN 0 \n", + "\n", + " works_source primary_email_domain \\\n", + "34 NaN seh.ox.ac.uk \n", + "47 [Scopus - Elsevier, Xinfeng Tang] NaN \n", + "299 NaN NaN \n", + "868 NaN stu.adu.edu.tr \n", + "1176 NaN gmail.com \n", + "\n", + " other_email_domains \n", + "34 [bsg.ox.ac.uk] \n", + "47 [foxmail.com] \n", + "299 [nhs.net] \n", + "868 [gmail.com] \n", + "1176 [yildiz.edu.tr] " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['other_email_domains'].notna()].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "other_emails = df[['orcid', 'other_email_domains']].explode('other_email_domains').reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcid
other_email_domains
gmail.com10856
hotmail.com1521
yahoo.com1263
163.com763
qq.com755
......
ifzz.pan.pl1
ig.ufpa.br1
ig.ufu.br1
ig.utexas.edu1
zzuli.edu.cn1
\n", + "

12795 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " orcid\n", + "other_email_domains \n", + "gmail.com 10856\n", + "hotmail.com 1521\n", + "yahoo.com 1263\n", + "163.com 763\n", + "qq.com 755\n", + "... ...\n", + "ifzz.pan.pl 1\n", + "ig.ufpa.br 1\n", + "ig.ufu.br 1\n", + "ig.utexas.edu 1\n", + "zzuli.edu.cn 1\n", + "\n", + "[12795 rows x 1 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_other_emails = other_emails.groupby('other_email_domains').count().sort_values('orcid', ascending=False)\n", + "grouped_other_emails" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "gmail.com", + "hotmail.com", + "yahoo.com", + "163.com", + "qq.com", + "outlook.com", + "126.com", + "usp.br", + "ieee.org", + "mail.ru", + "yahoo.com.br", + "unesp.br", + "sbs.ox.ac.uk", + "yuhs.ac", + "naver.com", + "icloud.com", + "ua.pt", + "uq.edu.au", + "foxmail.com", + "cam.ac.uk", + "ukr.net", + "law.ox.ac.uk", + "imperial.ac.uk", + "mit.edu", + "monash.edu", + "ucl.ac.uk", + "education.ox.ac.uk", + "stanford.edu", + "ucm.es", + "conted.ox.ac.uk" + ], + "y": [ + 10856, + 1521, + 1263, + 763, + 755, + 422, + 256, + 235, + 223, + 147, + 146, + 138, + 136, + 130, + 128, + 113, + 92, + 90, + 90, + 81, + 76, + 75, + 75, + 74, + 69, + 67, + 67, + 66, + 65, + 64 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + 
"gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 
0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } 
+ }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + 
], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + 
"linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "Top 30 other email domains" + }, + "xaxis": { + "range": [ + -0.5, + 29.5 + ], + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = [\n", + " go.Bar(\n", + " x=grouped_other_emails[:30].sort_values(by=['orcid'], ascending=False).index,\n", + " y=grouped_other_emails[:30].sort_values(by=['orcid'], ascending=False)['orcid']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Top 30 other email domains',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
other_email_domains
orcid
0000-0003-4171-383512
0000-0001-6239-29689
0000-0003-2290-28177
0000-0003-2151-40897
0000-0001-9084-31566
......
0000-0002-1678-06680
0000-0002-1678-06840
0000-0002-1678-07050
0000-0002-1678-07130
0000-0003-5000-00010
\n", + "

10744621 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " other_email_domains\n", + "orcid \n", + "0000-0003-4171-3835 12\n", + "0000-0001-6239-2968 9\n", + "0000-0003-2290-2817 7\n", + "0000-0003-2151-4089 7\n", + "0000-0001-9084-3156 6\n", + "... ...\n", + "0000-0002-1678-0668 0\n", + "0000-0002-1678-0684 0\n", + "0000-0002-1678-0705 0\n", + "0000-0002-1678-0713 0\n", + "0000-0003-5000-0001 0\n", + "\n", + "[10744621 rows x 1 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "other_emails.groupby('orcid').count().sort_values('other_email_domains', ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Email speculation" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_sourceprimary_email_domainother_email_domains
470000-0001-5017-1295111XinfengTangNaNNaNNaNNaN[tang.xinfeng@foxmail.com]NaN[[Scopus Author ID, 56927186900]][[, , University of Hong Kong, Hong Kong, , HK...NaN11[Scopus - Elsevier, Xinfeng Tang]NaN[foxmail.com]
2990000-0001-5109-3989111colintysallNaNNaNNaNNaN[colin.tysall@nhs.net]NaNNaNNaN[[Associate Mental Health Act Manager, Coventr...0NaNNaN[nhs.net]
12960000-0001-5476-0126111Aura WindyHernández CetinaNaNNaNNaNNaN[u0902038@unimilitar.edu.co]NaNNaN[[, Profesional en Relaciones Internacionales ...[[Asistente de Investigación, Pontificia Unive...1[Aura Windy Hernández Cetina]NaN[unimilitar.edu.co]
14290000-0001-5522-427X111SüleymanÖzenNaNNaN[[Academic CV, https://akademik.yok.gov.tr/Aka...NaN[suleyman.ozen@btu.edu.tr][construction materials, superplasticizers, co...[[Scopus Author ID, 57188750603]][[Civil Engineering, MSc and PhD, Uludağ Unive...[[Dr., Bursa Technical University, Bursa, , TR...7[Scopus - Elsevier, Crossref]NaN[btu.edu.tr]
16280000-0001-5597-3115111WadeHarrisonNaNNaNNaNNaN[wade_harrison@unc.edu]NaNNaN[[, MD, Dartmouth College Geisel School of Med...[[Clinical Instructor / Research Fellow, Unive...7[Wade Harrison]NaN[unc.edu]
............................................................
107436580000-0003-3740-8352111RuiZhangNaNNaNNaNNaN[zhang-r15@mails.tsinghua.edu.cn][Lithium metal batteries, Graphene][[ResearcherID, B-3843-2015]][[Department of Chemical Engineering, Ph.D. st...NaN15[ResearcherID, Crossref]NaN[mails.tsinghua.edu.cn]
107448760000-0003-4192-6451111Sanjib RajPandeyNaNNaN[[Personal, https://www.sanjibpandey.wix.com/p...NaN[srpandey@gmail.com]NaNNaN[[Computing and Information System, PhD, Unive...[[Software Developer & Research Associate, Oxl...11[BASE - Bielefeld Academic Search Engine, Dr. ...NaN[gmail.com]
107452740000-0003-4333-9728111MarioDe la Fuente LloredaPerson in charge to coordinate the scientific ...[M.de la Fuente, De la Fuente, M.][[researchgate profile, https://www.researchga...NaN[mariofuente@gmail.com][vineyard management, grapevine, viticulture, ...[[Scopus Author ID, 47960975000]][[Producción Vegetal, Doctor en Viticultura, U...NaN3[Scopus - Elsevier]NaN[gmail.com]
107454170000-0003-4383-4745111JieYangNaNNaNNaNNaN[jyang@esat.kuleuven.be]NaNNaN[[faculty of engineering science, Dr., KU Leuv...NaN0NaNNaN[esat.kuleuven.be]
107467020000-0003-4878-2737111AlekseyAdamtsevichNaNNaN[[Moscow State University of Civil Engineering...NaN[AdamtsevichAO@mgsu.ru][concrete, calorimetry, cement, construction, ...[[Scopus Author ID, 56301531000], [ResearcherI...[[, Engineer (Industrial and Civil Engineering...[[Senior Researcher, Moscow State University o...25[Scopus - Elsevier, ResearcherID]NaN[mgsu.ru]
\n", + "

19409 rows × 19 columns

\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email \\\n", + "47 0000-0001-5017-1295 1 1 \n", + "299 0000-0001-5109-3989 1 1 \n", + "1296 0000-0001-5476-0126 1 1 \n", + "1429 0000-0001-5522-427X 1 1 \n", + "1628 0000-0001-5597-3115 1 1 \n", + "... ... ... ... \n", + "10743658 0000-0003-3740-8352 1 1 \n", + "10744876 0000-0003-4192-6451 1 1 \n", + "10745274 0000-0003-4333-9728 1 1 \n", + "10745417 0000-0003-4383-4745 1 1 \n", + "10746702 0000-0003-4878-2737 1 1 \n", + "\n", + " verified_primary_email given_names family_name \\\n", + "47 1 Xinfeng Tang \n", + "299 1 colin tysall \n", + "1296 1 Aura Windy Hernández Cetina \n", + "1429 1 Süleyman Özen \n", + "1628 1 Wade Harrison \n", + "... ... ... ... \n", + "10743658 1 Rui Zhang \n", + "10744876 1 Sanjib Raj Pandey \n", + "10745274 1 Mario De la Fuente Lloreda \n", + "10745417 1 Jie Yang \n", + "10746702 1 Aleksey Adamtsevich \n", + "\n", + " biography \\\n", + "47 NaN \n", + "299 NaN \n", + "1296 NaN \n", + "1429 NaN \n", + "1628 NaN \n", + "... ... \n", + "10743658 NaN \n", + "10744876 NaN \n", + "10745274 Person in charge to coordinate the scientific ... \n", + "10745417 NaN \n", + "10746702 NaN \n", + "\n", + " other_names \\\n", + "47 NaN \n", + "299 NaN \n", + "1296 NaN \n", + "1429 NaN \n", + "1628 NaN \n", + "... ... \n", + "10743658 NaN \n", + "10744876 NaN \n", + "10745274 [M.de la Fuente, De la Fuente, M.] \n", + "10745417 NaN \n", + "10746702 NaN \n", + "\n", + " urls primary_email \\\n", + "47 NaN NaN \n", + "299 NaN NaN \n", + "1296 NaN NaN \n", + "1429 [[Academic CV, https://akademik.yok.gov.tr/Aka... NaN \n", + "1628 NaN NaN \n", + "... ... ... \n", + "10743658 NaN NaN \n", + "10744876 [[Personal, https://www.sanjibpandey.wix.com/p... NaN \n", + "10745274 [[researchgate profile, https://www.researchga... NaN \n", + "10745417 NaN NaN \n", + "10746702 [[Moscow State University of Civil Engineering... 
NaN \n", + "\n", + " other_emails \\\n", + "47 [tang.xinfeng@foxmail.com] \n", + "299 [colin.tysall@nhs.net] \n", + "1296 [u0902038@unimilitar.edu.co] \n", + "1429 [suleyman.ozen@btu.edu.tr] \n", + "1628 [wade_harrison@unc.edu] \n", + "... ... \n", + "10743658 [zhang-r15@mails.tsinghua.edu.cn] \n", + "10744876 [srpandey@gmail.com] \n", + "10745274 [mariofuente@gmail.com] \n", + "10745417 [jyang@esat.kuleuven.be] \n", + "10746702 [AdamtsevichAO@mgsu.ru] \n", + "\n", + " keywords \\\n", + "47 NaN \n", + "299 NaN \n", + "1296 NaN \n", + "1429 [construction materials, superplasticizers, co... \n", + "1628 NaN \n", + "... ... \n", + "10743658 [Lithium metal batteries, Graphene] \n", + "10744876 NaN \n", + "10745274 [vineyard management, grapevine, viticulture, ... \n", + "10745417 NaN \n", + "10746702 [concrete, calorimetry, cement, construction, ... \n", + "\n", + " external_ids \\\n", + "47 [[Scopus Author ID, 56927186900]] \n", + "299 NaN \n", + "1296 NaN \n", + "1429 [[Scopus Author ID, 57188750603]] \n", + "1628 NaN \n", + "... ... \n", + "10743658 [[ResearcherID, B-3843-2015]] \n", + "10744876 NaN \n", + "10745274 [[Scopus Author ID, 47960975000]] \n", + "10745417 NaN \n", + "10746702 [[Scopus Author ID, 56301531000], [ResearcherI... \n", + "\n", + " education \\\n", + "47 [[, , University of Hong Kong, Hong Kong, , HK... \n", + "299 NaN \n", + "1296 [[, Profesional en Relaciones Internacionales ... \n", + "1429 [[Civil Engineering, MSc and PhD, Uludağ Unive... \n", + "1628 [[, MD, Dartmouth College Geisel School of Med... \n", + "... ... \n", + "10743658 [[Department of Chemical Engineering, Ph.D. st... \n", + "10744876 [[Computing and Information System, PhD, Unive... \n", + "10745274 [[Producción Vegetal, Doctor en Viticultura, U... \n", + "10745417 [[faculty of engineering science, Dr., KU Leuv... \n", + "10746702 [[, Engineer (Industrial and Civil Engineering... 
\n", + "\n", + " employment n_works \\\n", + "47 NaN 11 \n", + "299 [[Associate Mental Health Act Manager, Coventr... 0 \n", + "1296 [[Asistente de Investigación, Pontificia Unive... 1 \n", + "1429 [[Dr., Bursa Technical University, Bursa, , TR... 7 \n", + "1628 [[Clinical Instructor / Research Fellow, Unive... 7 \n", + "... ... ... \n", + "10743658 NaN 15 \n", + "10744876 [[Software Developer & Research Associate, Oxl... 11 \n", + "10745274 NaN 3 \n", + "10745417 NaN 0 \n", + "10746702 [[Senior Researcher, Moscow State University o... 25 \n", + "\n", + " works_source \\\n", + "47 [Scopus - Elsevier, Xinfeng Tang] \n", + "299 NaN \n", + "1296 [Aura Windy Hernández Cetina] \n", + "1429 [Scopus - Elsevier, Crossref] \n", + "1628 [Wade Harrison] \n", + "... ... \n", + "10743658 [ResearcherID, Crossref] \n", + "10744876 [BASE - Bielefeld Academic Search Engine, Dr. ... \n", + "10745274 [Scopus - Elsevier] \n", + "10745417 NaN \n", + "10746702 [Scopus - Elsevier, ResearcherID] \n", + "\n", + " primary_email_domain other_email_domains \n", + "47 NaN [foxmail.com] \n", + "299 NaN [nhs.net] \n", + "1296 NaN [unimilitar.edu.co] \n", + "1429 NaN [btu.edu.tr] \n", + "1628 NaN [unc.edu] \n", + "... ... ... 
\n", + "10743658 NaN [mails.tsinghua.edu.cn] \n", + "10744876 NaN [gmail.com] \n", + "10745274 NaN [gmail.com] \n", + "10745417 NaN [esat.kuleuven.be] \n", + "10746702 NaN [mgsu.ru] \n", + "\n", + "[19409 rows x 19 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['primary_email'].isna() & df['other_emails'].notna()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## URLs" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_url_domains(lst):\n", + " domains = []\n", + " for e in lst:\n", + " # e[0] is a string describing the url\n", + " # e[1] is the url\n", + " domain = tldextract.extract(e[1])\n", + " domains.append(domain.registered_domain)\n", + " return domains" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "df['url_domains'] = df['urls'].apply(lambda x: extract_url_domains(x) if isinstance(x, list) else x)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_sourceprimary_email_domainother_email_domainsurl_domains
50000-0001-5001-4994111SirenRühsI am an oceanographer studying the interannual...[Siren Ruehs][[ResearchGate, https://www.researchgate.net/p...NaNNaNNaNNaNNaNNaN11[Siren Rühs]NaNNaN[researchgate.net]
140000-0001-5004-7761111scaffoldinghireNaN[The first feature that you have to check in t...[[scaffolding hire Wellington, https://www.tig...NaNNaN[scaffolding hire Wellington]NaNNaNNaN0NaNNaNNaN[tigerscaffolds.co.nz]
150000-0001-5005-0557111SenRTNaNNaN[[Research on Psychology, psychiatry, Genetics...NaNNaNNaNNaNNaNNaN0NaNNaNNaN[corticalbrain.com]
290000-0001-5009-8091111GabrielaMadrugaPossui graduação em Medicina Veterinaria pela ...[Gabriela Morais Madruga][[Curriculo lattes, http://buscatextual.cnpq.b...NaNNaN[veterinary ophthalmology]NaN[[Surgery in small animal, PhD, Universidade E...[[PhD , University of Minnesota, Minneapolis, ...14[Gabriela Madruga]NaNNaN[cnpq.br]
300000-0001-5010-9539111Sangram KeshariSahuNaN[sk-sahu][[Academic webpage, https://sksahu.net]]NaNNaN[Computational Genomics and Bioinformatics][[Loop profile, 1098977]][[Centre for Bioinformatics, M.Sc. Bioinformat...[[Bioinformatics Junior Research Fellow, India...3[Crossref Metadata Search, Sangram Keshari Sahu]NaNNaN[sksahu.net]
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "5 0000-0001-5001-4994 1 1 1 \n", + "14 0000-0001-5004-7761 1 1 1 \n", + "15 0000-0001-5005-0557 1 1 1 \n", + "29 0000-0001-5009-8091 1 1 1 \n", + "30 0000-0001-5010-9539 1 1 1 \n", + "\n", + " given_names family_name \\\n", + "5 Siren Rühs \n", + "14 scaffolding hire \n", + "15 Sen RT \n", + "29 Gabriela Madruga \n", + "30 Sangram Keshari Sahu \n", + "\n", + " biography \\\n", + "5 I am an oceanographer studying the interannual... \n", + "14 NaN \n", + "15 NaN \n", + "29 Possui graduação em Medicina Veterinaria pela ... \n", + "30 NaN \n", + "\n", + " other_names \\\n", + "5 [Siren Ruehs] \n", + "14 [The first feature that you have to check in t... \n", + "15 NaN \n", + "29 [Gabriela Morais Madruga] \n", + "30 [sk-sahu] \n", + "\n", + " urls primary_email \\\n", + "5 [[ResearchGate, https://www.researchgate.net/p... NaN \n", + "14 [[scaffolding hire Wellington, https://www.tig... NaN \n", + "15 [[Research on Psychology, psychiatry, Genetics... NaN \n", + "29 [[Curriculo lattes, http://buscatextual.cnpq.b... NaN \n", + "30 [[Academic webpage, https://sksahu.net]] NaN \n", + "\n", + " other_emails keywords \\\n", + "5 NaN NaN \n", + "14 NaN [scaffolding hire Wellington] \n", + "15 NaN NaN \n", + "29 NaN [veterinary ophthalmology] \n", + "30 NaN [Computational Genomics and Bioinformatics] \n", + "\n", + " external_ids \\\n", + "5 NaN \n", + "14 NaN \n", + "15 NaN \n", + "29 NaN \n", + "30 [[Loop profile, 1098977]] \n", + "\n", + " education \\\n", + "5 NaN \n", + "14 NaN \n", + "15 NaN \n", + "29 [[Surgery in small animal, PhD, Universidade E... \n", + "30 [[Centre for Bioinformatics, M.Sc. Bioinformat... \n", + "\n", + " employment n_works \\\n", + "5 NaN 11 \n", + "14 NaN 0 \n", + "15 NaN 0 \n", + "29 [[PhD , University of Minnesota, Minneapolis, ... 14 \n", + "30 [[Bioinformatics Junior Research Fellow, India... 
3 \n", + "\n", + " works_source primary_email_domain \\\n", + "5 [Siren Rühs] NaN \n", + "14 NaN NaN \n", + "15 NaN NaN \n", + "29 [Gabriela Madruga] NaN \n", + "30 [Crossref Metadata Search, Sangram Keshari Sahu] NaN \n", + "\n", + " other_email_domains url_domains \n", + "5 NaN [researchgate.net] \n", + "14 NaN [tigerscaffolds.co.nz] \n", + "15 NaN [corticalbrain.com] \n", + "29 NaN [cnpq.br] \n", + "30 NaN [sksahu.net] " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['url_domains'].notna()].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "urls = df[['orcid', 'url_domains']].explode('url_domains').reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcid
url_domains
linkedin.com75344
researchgate.net66267
google.com43468
cnpq.br23936
academia.edu20786
......
gerberpumps.com1
gerbilvis.org1
gercekmedyumlar.org1
gerceksiyaset.com1
политуправление.рф1
\n", + "

193320 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " orcid\n", + "url_domains \n", + "linkedin.com 75344\n", + "researchgate.net 66267\n", + "google.com 43468\n", + "cnpq.br 23936\n", + "academia.edu 20786\n", + "... ...\n", + "gerberpumps.com 1\n", + "gerbilvis.org 1\n", + "gercekmedyumlar.org 1\n", + "gerceksiyaset.com 1\n", + "политуправление.рф 1\n", + "\n", + "[193320 rows x 1 columns]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_urls = urls.groupby('url_domains').count().sort_values('orcid', ascending=False)\n", + "grouped_urls" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "linkedin.com", + "researchgate.net", + "google.com", + "cnpq.br", + "academia.edu", + "twitter.com", + "facebook.com", + "publons.com", + "wordpress.com", + "mendeley.com", + "instagram.com", + "github.io", + "google.com.ua", + "blogspot.com", + "google.es", + "github.com", + "helsinki.fi", + "unirioja.es", + "youtube.com", + "wixsite.com", + "ku.dk", + "scopus.com", + "", + "weebly.com", + "us.es", + "kth.se", + "cityu.edu.hk", + "kcl.ac.uk", + "au.dk", + "ucl.ac.uk" + ], + "y": [ + 75344, + 66267, + 43468, + 23936, + 20786, + 18017, + 14552, + 10339, + 8883, + 7003, + 5532, + 5371, + 5273, + 5158, + 5070, + 5053, + 4682, + 4549, + 4196, + 4053, + 3730, + 3481, + 3332, + 3083, + 3029, + 2944, + 2719, + 2711, + 2640, + 2581 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } 
+ ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + 
"#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + 
"marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + 
"#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + 
"backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "Top 30 URL domains" + }, + "xaxis": { + "range": [ + -0.5, + 29.5 + ], + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = [\n", + " go.Bar(\n", + " x=grouped_urls[:30].sort_values(by=['orcid'], ascending=False).index,\n", + " y=grouped_urls[:30].sort_values(by=['orcid'], ascending=False)['orcid']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Top 30 URL domains',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
url_domains
orcid
0000-0002-1234-835X219
0000-0001-7478-4539174
0000-0002-7392-3792169
0000-0002-6938-9638152
0000-0003-2450-090X114
......
0000-0002-1883-05690
0000-0002-1883-05770
0000-0002-1883-05850
0000-0002-1883-06060
0000-0003-5000-00010
\n", + "

10744621 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " url_domains\n", + "orcid \n", + "0000-0002-1234-835X 219\n", + "0000-0001-7478-4539 174\n", + "0000-0002-7392-3792 169\n", + "0000-0002-6938-9638 152\n", + "0000-0003-2450-090X 114\n", + "... ...\n", + "0000-0002-1883-0569 0\n", + "0000-0002-1883-0577 0\n", + "0000-0002-1883-0585 0\n", + "0000-0002-1883-0606 0\n", + "0000-0003-5000-0001 0\n", + "\n", + "[10744621 rows x 1 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_most_domains = urls.groupby('orcid').count().sort_values('url_domains', ascending=False)\n", + "grouped_most_domains" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "0000-0002-1234-835X", + "0000-0001-7478-4539", + "0000-0002-7392-3792", + "0000-0002-6938-9638", + "0000-0003-2450-090X", + "0000-0002-5710-4041", + "0000-0002-3920-7389", + "0000-0002-6689-4129", + "0000-0002-4621-5571", + "0000-0001-9131-1266", + "0000-0002-7754-8889", + "0000-0002-9025-8632", + "0000-0002-5250-1144", + "0000-0002-7456-3848", + "0000-0003-0176-1293", + "0000-0003-0321-7339", + "0000-0002-8493-0402", + "0000-0002-9965-2425", + "0000-0001-8873-6677", + "0000-0002-3997-5070", + "0000-0002-1856-6905", + "0000-0002-4316-1467", + "0000-0002-4062-3603", + "0000-0003-1524-6268", + "0000-0002-0752-7513", + "0000-0003-0594-2462", + "0000-0001-5880-7091", + "0000-0003-2593-7134", + "0000-0002-1298-5252", + "0000-0003-1761-3842", + "0000-0003-2383-8386", + "0000-0003-3546-2312", + "0000-0002-2886-9248", + "0000-0003-2183-8112", + "0000-0002-1929-6054", + "0000-0003-2407-3557", + "0000-0003-0796-0234", + "0000-0001-7133-6896", + "0000-0003-1484-6958", + "0000-0002-4305-4215", + "0000-0002-4004-6666", + 
"0000-0002-7568-3403", + "0000-0002-9276-6921", + "0000-0002-8208-0897", + "0000-0003-4993-5555", + "0000-0003-0930-6121", + "0000-0002-8116-9611", + "0000-0002-9071-5450", + "0000-0003-4948-9268", + "0000-0002-3277-9659", + "0000-0001-9559-1103", + "0000-0002-8122-879X", + "0000-0003-2862-6315", + "0000-0002-2000-8339", + "0000-0003-4808-6619", + "0000-0002-6254-8683", + "0000-0002-6547-0172", + "0000-0001-5300-4601", + "0000-0002-0971-9375", + "0000-0003-3933-0229", + "0000-0002-4659-5391", + "0000-0003-0694-1154", + "0000-0001-6783-2037", + "0000-0002-2916-2893", + "0000-0001-6461-2573", + "0000-0003-4501-3756", + "0000-0001-5549-6822", + "0000-0003-4326-9336", + "0000-0002-8940-3177", + "0000-0001-8096-4333", + "0000-0001-8978-4830", + "0000-0002-8593-9257", + "0000-0002-5946-1595", + "0000-0002-6680-1703", + "0000-0002-5196-4905", + "0000-0002-7653-4899", + "0000-0001-6921-0426", + "0000-0002-5139-2660", + "0000-0001-8808-4867", + "0000-0002-7843-8497", + "0000-0003-1675-2840", + "0000-0001-8644-2114", + "0000-0003-1815-1993", + "0000-0003-0907-9870", + "0000-0001-7784-0583", + "0000-0002-5265-6074", + "0000-0001-7550-5802", + "0000-0001-8986-2528", + "0000-0002-3334-9386", + "0000-0002-0696-8560", + "0000-0002-7179-6953", + "0000-0001-6979-4273", + "0000-0001-9102-8639", + "0000-0002-5985-9114", + "0000-0002-9771-600X", + "0000-0001-7193-5039", + "0000-0001-7608-9433", + "0000-0002-5241-1026", + "0000-0001-6714-009X", + "0000-0003-2133-2648" + ], + "y": [ + 219, + 174, + 169, + 152, + 114, + 114, + 111, + 104, + 90, + 83, + 83, + 81, + 81, + 80, + 80, + 80, + 76, + 73, + 72, + 71, + 70, + 69, + 69, + 68, + 68, + 68, + 68, + 67, + 67, + 66, + 66, + 65, + 64, + 61, + 61, + 59, + 57, + 57, + 57, + 57, + 57, + 57, + 57, + 56, + 55, + 55, + 55, + 55, + 51, + 50, + 50, + 50, + 49, + 49, + 48, + 48, + 48, + 48, + 47, + 47, + 46, + 46, + 45, + 45, + 45, + 45, + 44, + 43, + 43, + 43, + 43, + 42, + 42, + 42, + 41, + 41, + 40, + 40, + 40, + 39, + 39, + 39, + 39, + 38, + 
38, + 38, + 38, + 38, + 37, + 37, + 37, + 37, + 37, + 36, + 36, + 36, + 36, + 36, + 36, + 36 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 
0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + 
"ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + 
"header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": 
"white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "Top 100 ORCID with URLs" + }, + "xaxis": { + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = [\n", + " go.Bar(\n", + " x=grouped_most_domains[:100].sort_values(by=['url_domains'], ascending=False).index,\n", + " y=grouped_most_domains[:100].sort_values(by=['url_domains'], ascending=False)['url_domains']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Top 100 ORCID with URLs',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_sourceprimary_email_domainother_email_domainsurl_domains
4828620000-0003-4948-9268111GustavoDuperréGustavo Norberto Duperré graduated in Arts and...[Gustavo Norberto Duperré, Duperré, G. N.][[Gis in Cultural Heritage - ICOMOS România, h...gustavo.duperre@usal.edu.arNaN[History of Art, Humanities, International Coo...[[Scopus Author ID, 57195936346], [ResearcherI...[[Programme in History, History of Art and Ter...[[Titular Professor, Dirección General de Cult...13[Gustavo Duperré, Scopus - Elsevier, Publons, ...usal.edu.arNaN[icomos.ro, unirioja.es, unirioja.es, unc.edu....
5548590000-0002-1929-6054111Franklin AméricoCanaza ChoqueDocente-Investigador Social. Maestrando en Der...[Franklin Américo Canaza-Choque , Franklin A. ...[[Consejo Nacional de Ciencia, Tecnología e In...Leo_123fa@hotmail.com[Leoameric123@gmail.com, Frankmericnazac@gmail...[Justicia Global; Democracia; Derechos Humanos...[[ResearcherID, P-8613-2018], [Loop profile, 8...[[Facultad de Ciencias de la Educación , Maest...[[Investigador Social, Universidad Católica de...38[ResearcherID, BASE - Bielefeld Academic Searc...hotmail.com[gmail.com, gmail.com, hotmail.com, baldwin.ed...[concytec.gob.pe, redalyc.org, redalyc.org, un...
13810920000-0002-9025-8632111buycannabisdispensaryWe procure and deliver premium cannabis strain...[We procure and deliver premium cannabis strai...[[find your cannabis & marijuana dispensary , ...NaNNaN[cannabis, cannabis culture, cannabis communit...NaNNaNNaN10[goowonderland dispensary]NaNNaN[goowonderland.com, goowonderland.com, goowond...
26793530000-0003-2407-3557111AbdulAzizAbdul Aziz was born on May 25, 1973, in Brebes...[Abdul Aziz, Aziz, Abdul, Aziz, A., Aziz, Abd,...[[Google Scholar, https://scholar.google.com/c...NaNNaN[Ilmu Ekonomi, Ekonomi Islam, Metodologi Penel...NaN[[Ilmu Ekonomi, Dr, Universitas Borobudur, Jak...[[Assisten Professor/Dr, Institut Agama Islam ...72[BASE - Bielefeld Academic Search Engine, Abdu...NaNNaN[google.com, syekhnurjati.ac.id, orcid.org, bl...
33544300000-0002-3920-7389111А.ГусевSurname, Name Gusev Alexander LeonidovichDate...[Alexander L. Gusev , Alexander Leonidovich Gu...[[A.L. Gusev Alternative Energy and Ecology, ...NaNNaN[Supercapacitors, Electrochromic, Photochromic...[[ResearcherID, F-8048-2014], [Scopus Author I...[[Chemical technology and cryogenic-vacuum tec...[[General Director, Scientific Technical Centr...472[Publons, DataCite, Scopus - Elsevier, A.L. Gu...NaNNaN[youtube.com, isjaee.com, researchgate.net, re...
40042810000-0002-5710-4041111RyszardRomaniukProfessor of Electronics and Communications En...[R.Romaniuk, R.S.Romaniuk, Ryszard Romaniuk, R...[[Scholar Google, http://scholar.google.pl/cit...rrom@ise.pw.edu.pl[R.Romaniuk@ise.pw.edu.pl, R.Romaniuk@elka.pw....[telecommunications, photonics, measurement sy...[[ISNI, 0000000071432485], [ResearcherID, B-91...[[Faculty of Electronics and Information Techn...[[Professor, Institute Director, Politechnika ...5008[INSPIRE-HEP, ResearcherID, ISNI2ORCID search ...ise.pw.edu.pl[ise.pw.edu.pl, elka.pw.edu.pl, cern.ch][google.pl, publons.com, scopus.com, mendeley....
40224800000-0003-2450-090X111EduardBabulakProfessor Eduard Babulak is accomplished inter...[Professor Eduard Babulak][[Honorary Chair, Chief Mentor & Senior Adviso...NaNNaN[Computer Security, Computer Networking, Inter...[[Scopus Author ID, 6506867432], [ResearcherID...[[Information Technology, Doctor Habilitated (...[[Consultant, Horizon 2020 Framework Programme...274[The Lens, BASE - Bielefeld Academic Search En...NaNNaN[worldassessmentcouncil.org, spseke.sk, bcs.or...
63353570000-0003-2593-7134111AanJaelaniAll my papers can be downloaded from portal:Re...[Jaelani, A., Jaelani, Aan][[Microsoft Academic Research, https://academi...aan_jaelani@syekhnurjati.ac.id[iainanjal@gmail.com][Islamic Economics, Tourism Industry, Islamic ...[[Scopus Author ID, 57195963463], [Loop profil...[[Post Graduate, S3/Dr, Universitas Islam Nege...[[Dr, Institut Agama Islam Negeri Syekh Nurjat...79[Publons, Aan Jaelani, Scopus - Elsevier, Dime...syekhnurjati.ac.id[gmail.com][microsoft.com, twitter.com, academia.edu, aca...
64898380000-0002-9965-2425111JaroslawSpychalaJaroslaw Spychala has received a doctoral degr...[Jaroslaw Jozef Spychala][[RESUME, http://www.biowebspin.com/wp-content...NaNNaN[organic chemistry, biochemistry, photochemist...[[Scopus Author ID, 7006745874]][[Department of Chemistry, Postdoctoral Associ...[[Assistant Professor, Adam Mickiewicz Univers...29[Scopus - Elsevier]NaNNaN[biowebspin.com, biowebspin.com, google.com, l...
75705840000-0003-2183-8112111Pelayo MunhozOleaPós-Doutorado em Gestão Ambiental pela Univers...[ Munhoz, Pelayo Olea, Olea, Pelayo, Olea, P...[[Currículo Lattes, http://lattes.cnpq.br/6209...NaNNaN[Inovação, Empreendedorismo, Sustentabilidade][[Scopus Author ID, 55175503300], [ResearcherI...[[, Postdoctoral in Environmental Sustainabili...[[Professor, Universidade Federal do Rio Grand...1105[The Lens, Pelayo Munhoz Olea, Dimensions, BAS...NaNNaN[cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c...
102405100000-0002-6938-9638111AdolfoCatral SanabriaMy education is in computer science, mathemati...NaN[[ResearchGate Adolfo Catral , https://www.res...NaNNaNNaN[[Loop profile, 747193]][[Education, Capacitación para la enseñanza en...NaN2023[BASE - Bielefeld Academic Search Engine, Data...NaNNaN[researchgate.net, youtube.com, linkedin.com, ...
104483040000-0002-4062-3603111JUAN DE DIOSBELTRÁN MANCILLAJUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut...[Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD...[[01.- Juan de Dios Beltrán Mancilla. Teoría O...NaNNaN[FILOSOFIA MEDICINA ARQUITECTURA ECONOMÍA DERE...NaN[[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR...[[INSPECTOR GENERAL JORNADA VESPERTINA // De 2...11[JUAN DE DIOS BELTR´´ÁN MANCILLA]NaNNaN[yumpu.com, ijopm.org, google.com, blogspot.co...
106638940000-0002-3997-5070111Dr. ParameshachariB DDr. Parameshachari B DACM Distinguished Speake...[Dr. PARAMESHACHARI B D][[GSSSIETW,MYSURU, http://geethashishu.in/], [...NaNNaN[Professor & Head |Dept. of TCE| GSSSIET for W...[[ResearcherID, F-7045-2018], [Scopus Author I...[[Electronics and Communication Engineering, P...[[ACM Distinguished Speaker (Volunteer), Assoc...93[Publons, Multidisciplinary Digital Publishing...NaNNaN[geethashishu.in, geethashishu.in, acm.org, go...
\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email \\\n", + "482862 0000-0003-4948-9268 1 1 \n", + "554859 0000-0002-1929-6054 1 1 \n", + "1381092 0000-0002-9025-8632 1 1 \n", + "2679353 0000-0003-2407-3557 1 1 \n", + "3354430 0000-0002-3920-7389 1 1 \n", + "4004281 0000-0002-5710-4041 1 1 \n", + "4022480 0000-0003-2450-090X 1 1 \n", + "6335357 0000-0003-2593-7134 1 1 \n", + "6489838 0000-0002-9965-2425 1 1 \n", + "7570584 0000-0003-2183-8112 1 1 \n", + "10240510 0000-0002-6938-9638 1 1 \n", + "10448304 0000-0002-4062-3603 1 1 \n", + "10663894 0000-0002-3997-5070 1 1 \n", + "\n", + " verified_primary_email given_names family_name \\\n", + "482862 1 Gustavo Duperré \n", + "554859 1 Franklin Américo Canaza Choque \n", + "1381092 1 buycannabis dispensary \n", + "2679353 1 Abdul Aziz \n", + "3354430 1 А. Гусев \n", + "4004281 1 Ryszard Romaniuk \n", + "4022480 1 Eduard Babulak \n", + "6335357 1 Aan Jaelani \n", + "6489838 1 Jaroslaw Spychala \n", + "7570584 1 Pelayo Munhoz Olea \n", + "10240510 1 Adolfo Catral Sanabria \n", + "10448304 1 JUAN DE DIOS BELTRÁN MANCILLA \n", + "10663894 1 Dr. Parameshachari B D \n", + "\n", + " biography \\\n", + "482862 Gustavo Norberto Duperré graduated in Arts and... \n", + "554859 Docente-Investigador Social. Maestrando en Der... \n", + "1381092 We procure and deliver premium cannabis strain... \n", + "2679353 Abdul Aziz was born on May 25, 1973, in Brebes... \n", + "3354430 Surname, Name Gusev Alexander LeonidovichDate... \n", + "4004281 Professor of Electronics and Communications En... \n", + "4022480 Professor Eduard Babulak is accomplished inter... \n", + "6335357 All my papers can be downloaded from portal:Re... \n", + "6489838 Jaroslaw Spychala has received a doctoral degr... \n", + "7570584 Pós-Doutorado em Gestão Ambiental pela Univers... \n", + "10240510 My education is in computer science, mathemati... \n", + "10448304 JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... \n", + "10663894 Dr. 
Parameshachari B DACM Distinguished Speake... \n", + "\n", + " other_names \\\n", + "482862 [Gustavo Norberto Duperré, Duperré, G. N.] \n", + "554859 [Franklin Américo Canaza-Choque , Franklin A. ... \n", + "1381092 [We procure and deliver premium cannabis strai... \n", + "2679353 [Abdul Aziz, Aziz, Abdul, Aziz, A., Aziz, Abd,... \n", + "3354430 [Alexander L. Gusev , Alexander Leonidovich Gu... \n", + "4004281 [R.Romaniuk, R.S.Romaniuk, Ryszard Romaniuk, R... \n", + "4022480 [Professor Eduard Babulak] \n", + "6335357 [Jaelani, A., Jaelani, Aan] \n", + "6489838 [Jaroslaw Jozef Spychala] \n", + "7570584 [ Munhoz, Pelayo Olea, Olea, Pelayo, Olea, P... \n", + "10240510 NaN \n", + "10448304 [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... \n", + "10663894 [Dr. PARAMESHACHARI B D] \n", + "\n", + " urls \\\n", + "482862 [[Gis in Cultural Heritage - ICOMOS România, h... \n", + "554859 [[Consejo Nacional de Ciencia, Tecnología e In... \n", + "1381092 [[find your cannabis & marijuana dispensary , ... \n", + "2679353 [[Google Scholar, https://scholar.google.com/c... \n", + "3354430 [[A.L. Gusev Alternative Energy and Ecology, ... \n", + "4004281 [[Scholar Google, http://scholar.google.pl/cit... \n", + "4022480 [[Honorary Chair, Chief Mentor & Senior Adviso... \n", + "6335357 [[Microsoft Academic Research, https://academi... \n", + "6489838 [[RESUME, http://www.biowebspin.com/wp-content... \n", + "7570584 [[Currículo Lattes, http://lattes.cnpq.br/6209... \n", + "10240510 [[ResearchGate Adolfo Catral , https://www.res... \n", + "10448304 [[01.- Juan de Dios Beltrán Mancilla. Teoría O... \n", + "10663894 [[GSSSIETW,MYSURU, http://geethashishu.in/], [... 
\n", + "\n", + " primary_email \\\n", + "482862 gustavo.duperre@usal.edu.ar \n", + "554859 Leo_123fa@hotmail.com \n", + "1381092 NaN \n", + "2679353 NaN \n", + "3354430 NaN \n", + "4004281 rrom@ise.pw.edu.pl \n", + "4022480 NaN \n", + "6335357 aan_jaelani@syekhnurjati.ac.id \n", + "6489838 NaN \n", + "7570584 NaN \n", + "10240510 NaN \n", + "10448304 NaN \n", + "10663894 NaN \n", + "\n", + " other_emails \\\n", + "482862 NaN \n", + "554859 [Leoameric123@gmail.com, Frankmericnazac@gmail... \n", + "1381092 NaN \n", + "2679353 NaN \n", + "3354430 NaN \n", + "4004281 [R.Romaniuk@ise.pw.edu.pl, R.Romaniuk@elka.pw.... \n", + "4022480 NaN \n", + "6335357 [iainanjal@gmail.com] \n", + "6489838 NaN \n", + "7570584 NaN \n", + "10240510 NaN \n", + "10448304 NaN \n", + "10663894 NaN \n", + "\n", + " keywords \\\n", + "482862 [History of Art, Humanities, International Coo... \n", + "554859 [Justicia Global; Democracia; Derechos Humanos... \n", + "1381092 [cannabis, cannabis culture, cannabis communit... \n", + "2679353 [Ilmu Ekonomi, Ekonomi Islam, Metodologi Penel... \n", + "3354430 [Supercapacitors, Electrochromic, Photochromic... \n", + "4004281 [telecommunications, photonics, measurement sy... \n", + "4022480 [Computer Security, Computer Networking, Inter... \n", + "6335357 [Islamic Economics, Tourism Industry, Islamic ... \n", + "6489838 [organic chemistry, biochemistry, photochemist... \n", + "7570584 [Inovação, Empreendedorismo, Sustentabilidade] \n", + "10240510 NaN \n", + "10448304 [FILOSOFIA MEDICINA ARQUITECTURA ECONOMÍA DERE... \n", + "10663894 [Professor & Head |Dept. of TCE| GSSSIET for W... \n", + "\n", + " external_ids \\\n", + "482862 [[Scopus Author ID, 57195936346], [ResearcherI... \n", + "554859 [[ResearcherID, P-8613-2018], [Loop profile, 8... \n", + "1381092 NaN \n", + "2679353 NaN \n", + "3354430 [[ResearcherID, F-8048-2014], [Scopus Author I... \n", + "4004281 [[ISNI, 0000000071432485], [ResearcherID, B-91... 
\n", + "4022480 [[Scopus Author ID, 6506867432], [ResearcherID... \n", + "6335357 [[Scopus Author ID, 57195963463], [Loop profil... \n", + "6489838 [[Scopus Author ID, 7006745874]] \n", + "7570584 [[Scopus Author ID, 55175503300], [ResearcherI... \n", + "10240510 [[Loop profile, 747193]] \n", + "10448304 NaN \n", + "10663894 [[ResearcherID, F-7045-2018], [Scopus Author I... \n", + "\n", + " education \\\n", + "482862 [[Programme in History, History of Art and Ter... \n", + "554859 [[Facultad de Ciencias de la Educación , Maest... \n", + "1381092 NaN \n", + "2679353 [[Ilmu Ekonomi, Dr, Universitas Borobudur, Jak... \n", + "3354430 [[Chemical technology and cryogenic-vacuum tec... \n", + "4004281 [[Faculty of Electronics and Information Techn... \n", + "4022480 [[Information Technology, Doctor Habilitated (... \n", + "6335357 [[Post Graduate, S3/Dr, Universitas Islam Nege... \n", + "6489838 [[Department of Chemistry, Postdoctoral Associ... \n", + "7570584 [[, Postdoctoral in Environmental Sustainabili... \n", + "10240510 [[Education, Capacitación para la enseñanza en... \n", + "10448304 [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... \n", + "10663894 [[Electronics and Communication Engineering, P... \n", + "\n", + " employment n_works \\\n", + "482862 [[Titular Professor, Dirección General de Cult... 13 \n", + "554859 [[Investigador Social, Universidad Católica de... 38 \n", + "1381092 NaN 10 \n", + "2679353 [[Assisten Professor/Dr, Institut Agama Islam ... 72 \n", + "3354430 [[General Director, Scientific Technical Centr... 472 \n", + "4004281 [[Professor, Institute Director, Politechnika ... 5008 \n", + "4022480 [[Consultant, Horizon 2020 Framework Programme... 274 \n", + "6335357 [[Dr, Institut Agama Islam Negeri Syekh Nurjat... 79 \n", + "6489838 [[Assistant Professor, Adam Mickiewicz Univers... 29 \n", + "7570584 [[Professor, Universidade Federal do Rio Grand... 1105 \n", + "10240510 NaN 2023 \n", + "10448304 [[INSPECTOR GENERAL JORNADA VESPERTINA // De 2... 
11 \n", + "10663894 [[ACM Distinguished Speaker (Volunteer), Assoc... 93 \n", + "\n", + " works_source \\\n", + "482862 [Gustavo Duperré, Scopus - Elsevier, Publons, ... \n", + "554859 [ResearcherID, BASE - Bielefeld Academic Searc... \n", + "1381092 [goowonderland dispensary] \n", + "2679353 [BASE - Bielefeld Academic Search Engine, Abdu... \n", + "3354430 [Publons, DataCite, Scopus - Elsevier, A.L. Gu... \n", + "4004281 [INSPIRE-HEP, ResearcherID, ISNI2ORCID search ... \n", + "4022480 [The Lens, BASE - Bielefeld Academic Search En... \n", + "6335357 [Publons, Aan Jaelani, Scopus - Elsevier, Dime... \n", + "6489838 [Scopus - Elsevier] \n", + "7570584 [The Lens, Pelayo Munhoz Olea, Dimensions, BAS... \n", + "10240510 [BASE - Bielefeld Academic Search Engine, Data... \n", + "10448304 [JUAN DE DIOS BELTR´´ÁN MANCILLA] \n", + "10663894 [Publons, Multidisciplinary Digital Publishing... \n", + "\n", + " primary_email_domain \\\n", + "482862 usal.edu.ar \n", + "554859 hotmail.com \n", + "1381092 NaN \n", + "2679353 NaN \n", + "3354430 NaN \n", + "4004281 ise.pw.edu.pl \n", + "4022480 NaN \n", + "6335357 syekhnurjati.ac.id \n", + "6489838 NaN \n", + "7570584 NaN \n", + "10240510 NaN \n", + "10448304 NaN \n", + "10663894 NaN \n", + "\n", + " other_email_domains \\\n", + "482862 NaN \n", + "554859 [gmail.com, gmail.com, hotmail.com, baldwin.ed... \n", + "1381092 NaN \n", + "2679353 NaN \n", + "3354430 NaN \n", + "4004281 [ise.pw.edu.pl, elka.pw.edu.pl, cern.ch] \n", + "4022480 NaN \n", + "6335357 [gmail.com] \n", + "6489838 NaN \n", + "7570584 NaN \n", + "10240510 NaN \n", + "10448304 NaN \n", + "10663894 NaN \n", + "\n", + " url_domains \n", + "482862 [icomos.ro, unirioja.es, unirioja.es, unc.edu.... \n", + "554859 [concytec.gob.pe, redalyc.org, redalyc.org, un... \n", + "1381092 [goowonderland.com, goowonderland.com, goowond... \n", + "2679353 [google.com, syekhnurjati.ac.id, orcid.org, bl... \n", + "3354430 [youtube.com, isjaee.com, researchgate.net, re... 
\n", + "4004281 [google.pl, publons.com, scopus.com, mendeley.... \n", + "4022480 [worldassessmentcouncil.org, spseke.sk, bcs.or... \n", + "6335357 [microsoft.com, twitter.com, academia.edu, aca... \n", + "6489838 [biowebspin.com, biowebspin.com, google.com, l... \n", + "7570584 [cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c... \n", + "10240510 [researchgate.net, youtube.com, linkedin.com, ... \n", + "10448304 [yumpu.com, ijopm.org, google.com, blogspot.co... \n", + "10663894 [geethashishu.in, geethashishu.in, acm.org, go... " + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_sourceprimary_email_domainother_email_domainsurl_domains
455660000-0003-1948-3180111MarkKatzMark N. Katz is a professor of government and ...NaN[[Adjusting to Change: American Foreign Policy...NaNNaNNaN[[Scopus Author ID, 25649901800]][[Political Science, Ph.D., Massachusetts Inst...[[Professor of Government and Politics, George...58[Scopus - Elsevier]NaNNaN[wordpress.com, marknkatz.com, gmu.edu, atlant...
726740000-0002-2000-8339111Phòng khám tư nhân Hà NộiNaNNaNNaN[[Sức khỏe, https://onhealth.vn/], [Khám phụ k...NaNNaNNaNNaNNaNNaN4[Phòng khám tư nhân Hà Nội]NaNNaN[onhealth.vn, onhealth.vn, onhealth.vn, onheal...
1728200000-0001-9293-2224111Juan CarlosGarcia HoyosMy name is Juan Carlos García Hoyos. I was bor...[Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /...[[Air Force Office of Scientific Research (WRI...NaNNaN[Exolinguistics, Ethnoastronomy, Sociology of ...NaN[[Faculty of Philosophy, History - Ph.D., Char...[[responsible for the Project Service Level Ag...20[Juan Carlos Garcia Hoyos]NaNNaN[af.mil, gst.com, govtribe.com, sbir.gov, open...
2095050000-0003-3045-0056111AnandaMajumdarI am Ananda Majumdar, Child Care Educator at B...NaN[[Migration Scholar and Ananda , https://grfdt...NaNNaNNaNNaN[[Education , B.Ed. After Degree , University ...[[General Coordinator- University of Alberta C...43[Ananda Majumdar]NaNNaN[grfdt.com, linkedin.com, academia.edu, resear...
2598770000-0003-1815-5732111JAS(Jurnal Akuntansi Syariah)JAS (Jurnal Akuntansi Syariah) published in pr...NaN[[Website, https://ejournal.stiesyariahbengkal...NaNNaN[Akuntansi, Akuntansi Syariah]NaNNaNNaN67[JAS (Jurnal Akuntansi Syariah)]NaNNaN[stiesyariahbengkalis.ac.id, lipi.go.id, cross...
...............................................................
104948200000-0002-1324-7171111Vanesa NataliaRodriguezNombre y Apellido: Vanesa Natalia Rodriguez. ...[Vanesa Rodriguez, Vanesa N. Rodriguez][[De rufianes y franchutas Representaciones y ...NaNNaN[Historia - Prostitución - Mujeres - Enfermeda...NaN[[, Maestría en Ciencias Sociales con Mención ...[[Profesora, Universidad Nacional de La Matanz...7[Vanesa Natalia Rodriguez]NaNNaN[unlam.edu.ar, unirioja.es, amazon.fr, abebook...
104958060000-0002-1700-8311111Fix-ITRiteNaN[Best Heating & Plumbing Company][[Website, https://fix-itrite.com], [Muckrack,...NaNNaN[Plumber, Appliance, Refrigerator, Repair , Se...NaNNaNNaN1[Fix-It Rite]NaNNaN[fix-itrite.com, muckrack.com, tumblr.com, dri...
106335450000-0003-2676-4431111BennySoewandiNaN[Benny Soewandi][[Conservation Efforts as a Result of Theoreti...NaNNaN[Researchers-Conservator for the Architectural...NaNNaN[[Membership, Paguyuban Pelestarian Budaya Ban...2[Benny Soewandi]NaNNaN[wordpress.com, wordpress.com, linkedin.com, f...
106482410000-0001-8157-0600111BijanYavarSenior Research Assistant and Phd Student in O...[B. Yavar, Yavar Bijan][[Web of Science (Pub) Researcher ID: A-3544-2...NaNNaN[Certainty and Uncertainty, Risk Analysis (Qua...[[Scopus Author ID, 56556873600]]NaNNaN6[Scopus - Elsevier]NaNNaN[publons.com, articulate.com, zenodo.org, orci...
106796990000-0002-9874-1450111FENGZHIWUNaNNaN[[A Systematic Study on the Dynamic Softening ...NaNNaNNaNNaNNaNNaN3[FENGZHI WU]NaNNaN[springer.com, sciencedirect.com, sciencedirec...
\n", + "

139 rows × 20 columns

\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email \\\n", + "45566 0000-0003-1948-3180 1 1 \n", + "72674 0000-0002-2000-8339 1 1 \n", + "172820 0000-0001-9293-2224 1 1 \n", + "209505 0000-0003-3045-0056 1 1 \n", + "259877 0000-0003-1815-5732 1 1 \n", + "... ... ... ... \n", + "10494820 0000-0002-1324-7171 1 1 \n", + "10495806 0000-0002-1700-8311 1 1 \n", + "10633545 0000-0003-2676-4431 1 1 \n", + "10648241 0000-0001-8157-0600 1 1 \n", + "10679699 0000-0002-9874-1450 1 1 \n", + "\n", + " verified_primary_email given_names \\\n", + "45566 1 Mark \n", + "72674 1 Phòng khám tư nhân Hà Nội \n", + "172820 1 Juan Carlos \n", + "209505 1 Ananda \n", + "259877 1 JAS \n", + "... ... ... \n", + "10494820 1 Vanesa Natalia \n", + "10495806 1 Fix-IT \n", + "10633545 1 Benny \n", + "10648241 1 Bijan \n", + "10679699 1 FENGZHI \n", + "\n", + " family_name \\\n", + "45566 Katz \n", + "72674 NaN \n", + "172820 Garcia Hoyos \n", + "209505 Majumdar \n", + "259877 (Jurnal Akuntansi Syariah) \n", + "... ... \n", + "10494820 Rodriguez \n", + "10495806 Rite \n", + "10633545 Soewandi \n", + "10648241 Yavar \n", + "10679699 WU \n", + "\n", + " biography \\\n", + "45566 Mark N. Katz is a professor of government and ... \n", + "72674 NaN \n", + "172820 My name is Juan Carlos García Hoyos. I was bor... \n", + "209505 I am Ananda Majumdar, Child Care Educator at B... \n", + "259877 JAS (Jurnal Akuntansi Syariah) published in pr... \n", + "... ... \n", + "10494820 Nombre y Apellido: Vanesa Natalia Rodriguez. ... \n", + "10495806 NaN \n", + "10633545 NaN \n", + "10648241 Senior Research Assistant and Phd Student in O... \n", + "10679699 NaN \n", + "\n", + " other_names \\\n", + "45566 NaN \n", + "72674 NaN \n", + "172820 [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... \n", + "209505 NaN \n", + "259877 NaN \n", + "... ... \n", + "10494820 [Vanesa Rodriguez, Vanesa N. Rodriguez] \n", + "10495806 [Best Heating & Plumbing Company] \n", + "10633545 [Benny Soewandi] \n", + "10648241 [B. 
Yavar, Yavar Bijan] \n", + "10679699 NaN \n", + "\n", + " urls primary_email \\\n", + "45566 [[Adjusting to Change: American Foreign Policy... NaN \n", + "72674 [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN \n", + "172820 [[Air Force Office of Scientific Research (WRI... NaN \n", + "209505 [[Migration Scholar and Ananda , https://grfdt... NaN \n", + "259877 [[Website, https://ejournal.stiesyariahbengkal... NaN \n", + "... ... ... \n", + "10494820 [[De rufianes y franchutas Representaciones y ... NaN \n", + "10495806 [[Website, https://fix-itrite.com], [Muckrack,... NaN \n", + "10633545 [[Conservation Efforts as a Result of Theoreti... NaN \n", + "10648241 [[Web of Science (Pub) Researcher ID: A-3544-2... NaN \n", + "10679699 [[A Systematic Study on the Dynamic Softening ... NaN \n", + "\n", + " other_emails keywords \\\n", + "45566 NaN NaN \n", + "72674 NaN NaN \n", + "172820 NaN [Exolinguistics, Ethnoastronomy, Sociology of ... \n", + "209505 NaN NaN \n", + "259877 NaN [Akuntansi, Akuntansi Syariah] \n", + "... ... ... \n", + "10494820 NaN [Historia - Prostitución - Mujeres - Enfermeda... \n", + "10495806 NaN [Plumber, Appliance, Refrigerator, Repair , Se... \n", + "10633545 NaN [Researchers-Conservator for the Architectural... \n", + "10648241 NaN [Certainty and Uncertainty, Risk Analysis (Qua... \n", + "10679699 NaN NaN \n", + "\n", + " external_ids \\\n", + "45566 [[Scopus Author ID, 25649901800]] \n", + "72674 NaN \n", + "172820 NaN \n", + "209505 NaN \n", + "259877 NaN \n", + "... ... \n", + "10494820 NaN \n", + "10495806 NaN \n", + "10633545 NaN \n", + "10648241 [[Scopus Author ID, 56556873600]] \n", + "10679699 NaN \n", + "\n", + " education \\\n", + "45566 [[Political Science, Ph.D., Massachusetts Inst... \n", + "72674 NaN \n", + "172820 [[Faculty of Philosophy, History - Ph.D., Char... \n", + "209505 [[Education , B.Ed. After Degree , University ... \n", + "259877 NaN \n", + "... ... 
\n", + "10494820 [[, Maestría en Ciencias Sociales con Mención ... \n", + "10495806 NaN \n", + "10633545 NaN \n", + "10648241 NaN \n", + "10679699 NaN \n", + "\n", + " employment n_works \\\n", + "45566 [[Professor of Government and Politics, George... 58 \n", + "72674 NaN 4 \n", + "172820 [[responsible for the Project Service Level Ag... 20 \n", + "209505 [[General Coordinator- University of Alberta C... 43 \n", + "259877 NaN 67 \n", + "... ... ... \n", + "10494820 [[Profesora, Universidad Nacional de La Matanz... 7 \n", + "10495806 NaN 1 \n", + "10633545 [[Membership, Paguyuban Pelestarian Budaya Ban... 2 \n", + "10648241 NaN 6 \n", + "10679699 NaN 3 \n", + "\n", + " works_source primary_email_domain \\\n", + "45566 [Scopus - Elsevier] NaN \n", + "72674 [Phòng khám tư nhân Hà Nội] NaN \n", + "172820 [Juan Carlos Garcia Hoyos] NaN \n", + "209505 [Ananda Majumdar] NaN \n", + "259877 [JAS (Jurnal Akuntansi Syariah)] NaN \n", + "... ... ... \n", + "10494820 [Vanesa Natalia Rodriguez] NaN \n", + "10495806 [Fix-It Rite] NaN \n", + "10633545 [Benny Soewandi] NaN \n", + "10648241 [Scopus - Elsevier] NaN \n", + "10679699 [FENGZHI WU] NaN \n", + "\n", + " other_email_domains \\\n", + "45566 NaN \n", + "72674 NaN \n", + "172820 NaN \n", + "209505 NaN \n", + "259877 NaN \n", + "... ... \n", + "10494820 NaN \n", + "10495806 NaN \n", + "10633545 NaN \n", + "10648241 NaN \n", + "10679699 NaN \n", + "\n", + " url_domains \n", + "45566 [wordpress.com, marknkatz.com, gmu.edu, atlant... \n", + "72674 [onhealth.vn, onhealth.vn, onhealth.vn, onheal... \n", + "172820 [af.mil, gst.com, govtribe.com, sbir.gov, open... \n", + "209505 [grfdt.com, linkedin.com, academia.edu, resear... \n", + "259877 [stiesyariahbengkalis.ac.id, lipi.go.id, cross... \n", + "... ... \n", + "10494820 [unlam.edu.ar, unirioja.es, amazon.fr, abebook... \n", + "10495806 [fix-itrite.com, muckrack.com, tumblr.com, dri... \n", + "10633545 [wordpress.com, wordpress.com, linkedin.com, f... 
\n", + "10648241 [publons.com, articulate.com, zenodo.org, orci... \n", + "10679699 [springer.com, sciencedirect.com, sciencedirec... \n", + "\n", + "[139 rows x 20 columns]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_sourceprimary_email_domainother_email_domainsurl_domains
00000-0003-1948-3180111MarkKatzMark N. Katz is a professor of government and ...NaN[[Adjusting to Change: American Foreign Policy...NaNNaNNaN[[Scopus Author ID, 25649901800]][[Political Science, Ph.D., Massachusetts Inst...[[Professor of Government and Politics, George...58Scopus - ElsevierNaNNaN[wordpress.com, marknkatz.com, gmu.edu, atlant...
10000-0002-2000-8339111Phòng khám tư nhân Hà NộiNaNNaNNaN[[Sức khỏe, https://onhealth.vn/], [Khám phụ k...NaNNaNNaNNaNNaNNaN4Phòng khám tư nhân Hà NộiNaNNaN[onhealth.vn, onhealth.vn, onhealth.vn, onheal...
20000-0001-9293-2224111Juan CarlosGarcia HoyosMy name is Juan Carlos García Hoyos. I was bor...[Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /...[[Air Force Office of Scientific Research (WRI...NaNNaN[Exolinguistics, Ethnoastronomy, Sociology of ...NaN[[Faculty of Philosophy, History - Ph.D., Char...[[responsible for the Project Service Level Ag...20Juan Carlos Garcia HoyosNaNNaN[af.mil, gst.com, govtribe.com, sbir.gov, open...
30000-0003-3045-0056111AnandaMajumdarI am Ananda Majumdar, Child Care Educator at B...NaN[[Migration Scholar and Ananda , https://grfdt...NaNNaNNaNNaN[[Education , B.Ed. After Degree , University ...[[General Coordinator- University of Alberta C...43Ananda MajumdarNaNNaN[grfdt.com, linkedin.com, academia.edu, resear...
40000-0003-1815-5732111JAS(Jurnal Akuntansi Syariah)JAS (Jurnal Akuntansi Syariah) published in pr...NaN[[Website, https://ejournal.stiesyariahbengkal...NaNNaN[Akuntansi, Akuntansi Syariah]NaNNaNNaN67JAS (Jurnal Akuntansi Syariah)NaNNaN[stiesyariahbengkalis.ac.id, lipi.go.id, cross...
...............................................................
1340000-0002-1324-7171111Vanesa NataliaRodriguezNombre y Apellido: Vanesa Natalia Rodriguez. ...[Vanesa Rodriguez, Vanesa N. Rodriguez][[De rufianes y franchutas Representaciones y ...NaNNaN[Historia - Prostitución - Mujeres - Enfermeda...NaN[[, Maestría en Ciencias Sociales con Mención ...[[Profesora, Universidad Nacional de La Matanz...7Vanesa Natalia RodriguezNaNNaN[unlam.edu.ar, unirioja.es, amazon.fr, abebook...
1350000-0002-1700-8311111Fix-ITRiteNaN[Best Heating & Plumbing Company][[Website, https://fix-itrite.com], [Muckrack,...NaNNaN[Plumber, Appliance, Refrigerator, Repair , Se...NaNNaNNaN1Fix-It RiteNaNNaN[fix-itrite.com, muckrack.com, tumblr.com, dri...
1360000-0003-2676-4431111BennySoewandiNaN[Benny Soewandi][[Conservation Efforts as a Result of Theoreti...NaNNaN[Researchers-Conservator for the Architectural...NaNNaN[[Membership, Paguyuban Pelestarian Budaya Ban...2Benny SoewandiNaNNaN[wordpress.com, wordpress.com, linkedin.com, f...
1370000-0001-8157-0600111BijanYavarSenior Research Assistant and Phd Student in O...[B. Yavar, Yavar Bijan][[Web of Science (Pub) Researcher ID: A-3544-2...NaNNaN[Certainty and Uncertainty, Risk Analysis (Qua...[[Scopus Author ID, 56556873600]]NaNNaN6Scopus - ElsevierNaNNaN[publons.com, articulate.com, zenodo.org, orci...
1380000-0002-9874-1450111FENGZHIWUNaNNaN[[A Systematic Study on the Dynamic Softening ...NaNNaNNaNNaNNaNNaN3FENGZHI WUNaNNaN[springer.com, sciencedirect.com, sciencedirec...
\n", + "

139 rows × 20 columns

\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "0 0000-0003-1948-3180 1 1 1 \n", + "1 0000-0002-2000-8339 1 1 1 \n", + "2 0000-0001-9293-2224 1 1 1 \n", + "3 0000-0003-3045-0056 1 1 1 \n", + "4 0000-0003-1815-5732 1 1 1 \n", + ".. ... ... ... ... \n", + "134 0000-0002-1324-7171 1 1 1 \n", + "135 0000-0002-1700-8311 1 1 1 \n", + "136 0000-0003-2676-4431 1 1 1 \n", + "137 0000-0001-8157-0600 1 1 1 \n", + "138 0000-0002-9874-1450 1 1 1 \n", + "\n", + " given_names family_name \\\n", + "0 Mark Katz \n", + "1 Phòng khám tư nhân Hà Nội NaN \n", + "2 Juan Carlos Garcia Hoyos \n", + "3 Ananda Majumdar \n", + "4 JAS (Jurnal Akuntansi Syariah) \n", + ".. ... ... \n", + "134 Vanesa Natalia Rodriguez \n", + "135 Fix-IT Rite \n", + "136 Benny Soewandi \n", + "137 Bijan Yavar \n", + "138 FENGZHI WU \n", + "\n", + " biography \\\n", + "0 Mark N. Katz is a professor of government and ... \n", + "1 NaN \n", + "2 My name is Juan Carlos García Hoyos. I was bor... \n", + "3 I am Ananda Majumdar, Child Care Educator at B... \n", + "4 JAS (Jurnal Akuntansi Syariah) published in pr... \n", + ".. ... \n", + "134 Nombre y Apellido: Vanesa Natalia Rodriguez. ... \n", + "135 NaN \n", + "136 NaN \n", + "137 Senior Research Assistant and Phd Student in O... \n", + "138 NaN \n", + "\n", + " other_names \\\n", + "0 NaN \n", + "1 NaN \n", + "2 [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... \n", + "3 NaN \n", + "4 NaN \n", + ".. ... \n", + "134 [Vanesa Rodriguez, Vanesa N. Rodriguez] \n", + "135 [Best Heating & Plumbing Company] \n", + "136 [Benny Soewandi] \n", + "137 [B. Yavar, Yavar Bijan] \n", + "138 NaN \n", + "\n", + " urls primary_email \\\n", + "0 [[Adjusting to Change: American Foreign Policy... NaN \n", + "1 [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN \n", + "2 [[Air Force Office of Scientific Research (WRI... NaN \n", + "3 [[Migration Scholar and Ananda , https://grfdt... 
NaN \n", + "4 [[Website, https://ejournal.stiesyariahbengkal... NaN \n", + ".. ... ... \n", + "134 [[De rufianes y franchutas Representaciones y ... NaN \n", + "135 [[Website, https://fix-itrite.com], [Muckrack,... NaN \n", + "136 [[Conservation Efforts as a Result of Theoreti... NaN \n", + "137 [[Web of Science (Pub) Researcher ID: A-3544-2... NaN \n", + "138 [[A Systematic Study on the Dynamic Softening ... NaN \n", + "\n", + " other_emails keywords \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN [Exolinguistics, Ethnoastronomy, Sociology of ... \n", + "3 NaN NaN \n", + "4 NaN [Akuntansi, Akuntansi Syariah] \n", + ".. ... ... \n", + "134 NaN [Historia - Prostitución - Mujeres - Enfermeda... \n", + "135 NaN [Plumber, Appliance, Refrigerator, Repair , Se... \n", + "136 NaN [Researchers-Conservator for the Architectural... \n", + "137 NaN [Certainty and Uncertainty, Risk Analysis (Qua... \n", + "138 NaN NaN \n", + "\n", + " external_ids \\\n", + "0 [[Scopus Author ID, 25649901800]] \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + ".. ... \n", + "134 NaN \n", + "135 NaN \n", + "136 NaN \n", + "137 [[Scopus Author ID, 56556873600]] \n", + "138 NaN \n", + "\n", + " education \\\n", + "0 [[Political Science, Ph.D., Massachusetts Inst... \n", + "1 NaN \n", + "2 [[Faculty of Philosophy, History - Ph.D., Char... \n", + "3 [[Education , B.Ed. After Degree , University ... \n", + "4 NaN \n", + ".. ... \n", + "134 [[, Maestría en Ciencias Sociales con Mención ... \n", + "135 NaN \n", + "136 NaN \n", + "137 NaN \n", + "138 NaN \n", + "\n", + " employment n_works \\\n", + "0 [[Professor of Government and Politics, George... 58 \n", + "1 NaN 4 \n", + "2 [[responsible for the Project Service Level Ag... 20 \n", + "3 [[General Coordinator- University of Alberta C... 43 \n", + "4 NaN 67 \n", + ".. ... ... \n", + "134 [[Profesora, Universidad Nacional de La Matanz... 7 \n", + "135 NaN 1 \n", + "136 [[Membership, Paguyuban Pelestarian Budaya Ban... 
2 \n", + "137 NaN 6 \n", + "138 NaN 3 \n", + "\n", + " works_source primary_email_domain other_email_domains \\\n", + "0 Scopus - Elsevier NaN NaN \n", + "1 Phòng khám tư nhân Hà Nội NaN NaN \n", + "2 Juan Carlos Garcia Hoyos NaN NaN \n", + "3 Ananda Majumdar NaN NaN \n", + "4 JAS (Jurnal Akuntansi Syariah) NaN NaN \n", + ".. ... ... ... \n", + "134 Vanesa Natalia Rodriguez NaN NaN \n", + "135 Fix-It Rite NaN NaN \n", + "136 Benny Soewandi NaN NaN \n", + "137 Scopus - Elsevier NaN NaN \n", + "138 FENGZHI WU NaN NaN \n", + "\n", + " url_domains \n", + "0 [wordpress.com, marknkatz.com, gmu.edu, atlant... \n", + "1 [onhealth.vn, onhealth.vn, onhealth.vn, onheal... \n", + "2 [af.mil, gst.com, govtribe.com, sbir.gov, open... \n", + "3 [grfdt.com, linkedin.com, academia.edu, resear... \n", + "4 [stiesyariahbengkalis.ac.id, lipi.go.id, cross... \n", + ".. ... \n", + "134 [unlam.edu.ar, unirioja.es, amazon.fr, abebook... \n", + "135 [fix-itrite.com, muckrack.com, tumblr.com, dri... \n", + "136 [wordpress.com, wordpress.com, linkedin.com, f... \n", + "137 [publons.com, articulate.com, zenodo.org, orci... \n", + "138 [springer.com, sciencedirect.com, sciencedirec... \n", + "\n", + "[139 rows x 20 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)\n", + "exploded_sources" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_sourceprimary_email_domainother_email_domainsurl_domains
10000-0002-2000-8339111Phòng khám tư nhân Hà NộiNaNNaNNaN[[Sức khỏe, https://onhealth.vn/], [Khám phụ k...NaNNaNNaNNaNNaNNaN4Phòng khám tư nhân Hà NộiNaNNaN[onhealth.vn, onhealth.vn, onhealth.vn, onheal...
20000-0001-9293-2224111Juan CarlosGarcia HoyosMy name is Juan Carlos García Hoyos. I was bor...[Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /...[[Air Force Office of Scientific Research (WRI...NaNNaN[Exolinguistics, Ethnoastronomy, Sociology of ...NaN[[Faculty of Philosophy, History - Ph.D., Char...[[responsible for the Project Service Level Ag...20Juan Carlos Garcia HoyosNaNNaN[af.mil, gst.com, govtribe.com, sbir.gov, open...
30000-0003-3045-0056111AnandaMajumdarI am Ananda Majumdar, Child Care Educator at B...NaN[[Migration Scholar and Ananda , https://grfdt...NaNNaNNaNNaN[[Education , B.Ed. After Degree , University ...[[General Coordinator- University of Alberta C...43Ananda MajumdarNaNNaN[grfdt.com, linkedin.com, academia.edu, resear...
40000-0003-1815-5732111JAS(Jurnal Akuntansi Syariah)JAS (Jurnal Akuntansi Syariah) published in pr...NaN[[Website, https://ejournal.stiesyariahbengkal...NaNNaN[Akuntansi, Akuntansi Syariah]NaNNaNNaN67JAS (Jurnal Akuntansi Syariah)NaNNaN[stiesyariahbengkalis.ac.id, lipi.go.id, cross...
50000-0002-4379-6454111Caroline WanjiruKariukiCaroline holds a PhD in Economics from Curtin ...NaN[[Scopus Profile, https://www.scopus.com/dashb...NaNNaN[Applied Econometrics, Development Economics, ...NaN[[Economics, Doctor of Philosophy , Curtin Uni...[[Director, Educational Development, Strathmor...4Caroline Wanjiru KariukiNaNNaN[scopus.com, mendeley.com, publons.com, resear...
...............................................................
1320000-0001-6352-7086111SusanHawthorneSusan is a poet, novelist, publisher and Sansk...[S. Hawthorne, Susan C. C. Hawthorne][[Spinifex Press, http://www.spinifexpress.com...NaNNaN[Womens Studies, Philosophy, Ancient Greek, Sa...[[ResearcherID, K-6039-2018]][[School of Asian Studies, Honours Sanskrit, A...[[Adjunct Professor, James Cook University, To...352Susan HawthorneNaNNaN[spinifexpress.com.au, linkedin.com, twitter.c...
1330000-0002-4062-3603111JUAN DE DIOSBELTRÁN MANCILLAJUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut...[Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD...[[01.- Juan de Dios Beltrán Mancilla. Teoría O...NaNNaN[FILOSOFIA MEDICINA ARQUITECTURA ECONOMÍA DERE...NaN[[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR...[[INSPECTOR GENERAL JORNADA VESPERTINA // De 2...11JUAN DE DIOS BELTR´´ÁN MANCILLANaNNaN[yumpu.com, ijopm.org, google.com, blogspot.co...
1340000-0002-1324-7171111Vanesa NataliaRodriguezNombre y Apellido: Vanesa Natalia Rodriguez. ...[Vanesa Rodriguez, Vanesa N. Rodriguez][[De rufianes y franchutas Representaciones y ...NaNNaN[Historia - Prostitución - Mujeres - Enfermeda...NaN[[, Maestría en Ciencias Sociales con Mención ...[[Profesora, Universidad Nacional de La Matanz...7Vanesa Natalia RodriguezNaNNaN[unlam.edu.ar, unirioja.es, amazon.fr, abebook...
1360000-0003-2676-4431111BennySoewandiNaN[Benny Soewandi][[Conservation Efforts as a Result of Theoreti...NaNNaN[Researchers-Conservator for the Architectural...NaNNaN[[Membership, Paguyuban Pelestarian Budaya Ban...2Benny SoewandiNaNNaN[wordpress.com, wordpress.com, linkedin.com, f...
1380000-0002-9874-1450111FENGZHIWUNaNNaN[[A Systematic Study on the Dynamic Softening ...NaNNaNNaNNaNNaNNaN3FENGZHI WUNaNNaN[springer.com, sciencedirect.com, sciencedirec...
\n", + "

108 rows × 20 columns

\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "1 0000-0002-2000-8339 1 1 1 \n", + "2 0000-0001-9293-2224 1 1 1 \n", + "3 0000-0003-3045-0056 1 1 1 \n", + "4 0000-0003-1815-5732 1 1 1 \n", + "5 0000-0002-4379-6454 1 1 1 \n", + ".. ... ... ... ... \n", + "132 0000-0001-6352-7086 1 1 1 \n", + "133 0000-0002-4062-3603 1 1 1 \n", + "134 0000-0002-1324-7171 1 1 1 \n", + "136 0000-0003-2676-4431 1 1 1 \n", + "138 0000-0002-9874-1450 1 1 1 \n", + "\n", + " given_names family_name \\\n", + "1 Phòng khám tư nhân Hà Nội NaN \n", + "2 Juan Carlos Garcia Hoyos \n", + "3 Ananda Majumdar \n", + "4 JAS (Jurnal Akuntansi Syariah) \n", + "5 Caroline Wanjiru Kariuki \n", + ".. ... ... \n", + "132 Susan Hawthorne \n", + "133 JUAN DE DIOS BELTRÁN MANCILLA \n", + "134 Vanesa Natalia Rodriguez \n", + "136 Benny Soewandi \n", + "138 FENGZHI WU \n", + "\n", + " biography \\\n", + "1 NaN \n", + "2 My name is Juan Carlos García Hoyos. I was bor... \n", + "3 I am Ananda Majumdar, Child Care Educator at B... \n", + "4 JAS (Jurnal Akuntansi Syariah) published in pr... \n", + "5 Caroline holds a PhD in Economics from Curtin ... \n", + ".. ... \n", + "132 Susan is a poet, novelist, publisher and Sansk... \n", + "133 JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... \n", + "134 Nombre y Apellido: Vanesa Natalia Rodriguez. ... \n", + "136 NaN \n", + "138 NaN \n", + "\n", + " other_names \\\n", + "1 NaN \n", + "2 [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + ".. ... \n", + "132 [S. Hawthorne, Susan C. C. Hawthorne] \n", + "133 [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... \n", + "134 [Vanesa Rodriguez, Vanesa N. Rodriguez] \n", + "136 [Benny Soewandi] \n", + "138 NaN \n", + "\n", + " urls primary_email \\\n", + "1 [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN \n", + "2 [[Air Force Office of Scientific Research (WRI... NaN \n", + "3 [[Migration Scholar and Ananda , https://grfdt... 
NaN \n", + "4 [[Website, https://ejournal.stiesyariahbengkal... NaN \n", + "5 [[Scopus Profile, https://www.scopus.com/dashb... NaN \n", + ".. ... ... \n", + "132 [[Spinifex Press, http://www.spinifexpress.com... NaN \n", + "133 [[01.- Juan de Dios Beltrán Mancilla. Teoría O... NaN \n", + "134 [[De rufianes y franchutas Representaciones y ... NaN \n", + "136 [[Conservation Efforts as a Result of Theoreti... NaN \n", + "138 [[A Systematic Study on the Dynamic Softening ... NaN \n", + "\n", + " other_emails keywords \\\n", + "1 NaN NaN \n", + "2 NaN [Exolinguistics, Ethnoastronomy, Sociology of ... \n", + "3 NaN NaN \n", + "4 NaN [Akuntansi, Akuntansi Syariah] \n", + "5 NaN [Applied Econometrics, Development Economics, ... \n", + ".. ... ... \n", + "132 NaN [Womens Studies, Philosophy, Ancient Greek, Sa... \n", + "133 NaN [FILOSOFIA MEDICINA ARQUITECTURA ECONOMÍA DERE... \n", + "134 NaN [Historia - Prostitución - Mujeres - Enfermeda... \n", + "136 NaN [Researchers-Conservator for the Architectural... \n", + "138 NaN NaN \n", + "\n", + " external_ids \\\n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + ".. ... \n", + "132 [[ResearcherID, K-6039-2018]] \n", + "133 NaN \n", + "134 NaN \n", + "136 NaN \n", + "138 NaN \n", + "\n", + " education \\\n", + "1 NaN \n", + "2 [[Faculty of Philosophy, History - Ph.D., Char... \n", + "3 [[Education , B.Ed. After Degree , University ... \n", + "4 NaN \n", + "5 [[Economics, Doctor of Philosophy , Curtin Uni... \n", + ".. ... \n", + "132 [[School of Asian Studies, Honours Sanskrit, A... \n", + "133 [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... \n", + "134 [[, Maestría en Ciencias Sociales con Mención ... \n", + "136 NaN \n", + "138 NaN \n", + "\n", + " employment n_works \\\n", + "1 NaN 4 \n", + "2 [[responsible for the Project Service Level Ag... 20 \n", + "3 [[General Coordinator- University of Alberta C... 43 \n", + "4 NaN 67 \n", + "5 [[Director, Educational Development, Strathmor... 4 \n", + ".. ... 
... \n", + "132 [[Adjunct Professor, James Cook University, To... 352 \n", + "133 [[INSPECTOR GENERAL JORNADA VESPERTINA // De 2... 11 \n", + "134 [[Profesora, Universidad Nacional de La Matanz... 7 \n", + "136 [[Membership, Paguyuban Pelestarian Budaya Ban... 2 \n", + "138 NaN 3 \n", + "\n", + " works_source primary_email_domain other_email_domains \\\n", + "1 Phòng khám tư nhân Hà Nội NaN NaN \n", + "2 Juan Carlos Garcia Hoyos NaN NaN \n", + "3 Ananda Majumdar NaN NaN \n", + "4 JAS (Jurnal Akuntansi Syariah) NaN NaN \n", + "5 Caroline Wanjiru Kariuki NaN NaN \n", + ".. ... ... ... \n", + "132 Susan Hawthorne NaN NaN \n", + "133 JUAN DE DIOS BELTR´´ÁN MANCILLA NaN NaN \n", + "134 Vanesa Natalia Rodriguez NaN NaN \n", + "136 Benny Soewandi NaN NaN \n", + "138 FENGZHI WU NaN NaN \n", + "\n", + " url_domains \n", + "1 [onhealth.vn, onhealth.vn, onhealth.vn, onheal... \n", + "2 [af.mil, gst.com, govtribe.com, sbir.gov, open... \n", + "3 [grfdt.com, linkedin.com, academia.edu, resear... \n", + "4 [stiesyariahbengkalis.ac.id, lipi.go.id, cross... \n", + "5 [scopus.com, mendeley.com, publons.com, resear... \n", + ".. ... \n", + "132 [spinifexpress.com.au, linkedin.com, twitter.c... \n", + "133 [yumpu.com, ijopm.org, google.com, blogspot.co... \n", + "134 [unlam.edu.ar, unirioja.es, amazon.fr, abebook... \n", + "136 [wordpress.com, wordpress.com, linkedin.com, f... \n", + "138 [springer.com, sciencedirect.com, sciencedirec... 
\n", + "\n", + "[108 rows x 20 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Works source" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "def remove_own_source(lst, own):\n", + " res = []\n", + " if isinstance(lst, list) and pd.notna(own):\n", + " for ws in lst:\n", + " if ws.find(own) == -1:\n", + " res.append(ws)\n", + " return res\n", + " else:\n", + " return np.na()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'np' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ext_works_source'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mremove_own_source\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'works_source'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'given_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + 
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, raw, result_type, args, **kwds)\u001b[0m\n\u001b[1;32m 7766\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7767\u001b[0m )\n\u001b[0;32m-> 7768\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7769\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7770\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapplymap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_action\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_raw\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 186\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 187\u001b[0m 
\u001b[0;32mdef\u001b[0m \u001b[0mapply_empty_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 276\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mres_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_series_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;31m# wrap results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_series_generator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[0;31m# ignore SettingWithCopy here in case the user mutates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 290\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 291\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mABCSeries\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;31m# If we have a view on v, we need to make a copy because\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(x)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ext_works_source'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mremove_own_source\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'works_source'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'given_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mremove_own_source\u001b[0;34m(lst, own)\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'np' is not defined" + ] + } + ], + "source": [ + "df['ext_works_source'] = df.apply(lambda x: remove_own_source(x['works_source'], x['given_names']), axis=1)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/02-Spam filter.ipynb b/notebooks/02-Spam filter.ipynb new file mode 100644 index 0000000..903ef75 --- /dev/null +++ b/notebooks/02-Spam filter.ipynb @@ -0,0 +1,1096 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import ast\n", + "import tldextract\n", + "import numpy" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# Notable Solid ORCID iDs for debug purposes\n", + "AM = '0000-0002-5193-7851'\n", + "PP = '0000-0002-8588-4196'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "# Notable fake ORCID iDs for debug purposes\n", + "SCAFFOLD = '0000-0001-5004-7761'\n", + "WHATSAPP = '0000-0001-6997-9470'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n", + " dtype = {'orcid': pd.StringDtype(), \n", 
+ " 'claimed': bool, \n", + " 'verifyed email': bool, \n", + " 'verified primary email': bool,\n", + " 'given names': pd.StringDtype(),\n", + " 'family name': pd.StringDtype(),\n", + " 'biography': pd.StringDtype(),\n", + " 'other names': pd.StringDtype(),\n", + " 'researcher urls': pd.StringDtype(),\n", + " 'primary email': pd.StringDtype(),\n", + " 'other emails': pd.StringDtype(),\n", + " 'keywords': pd.StringDtype(),\n", + " 'external identifiers': pd.StringDtype(),\n", + " 'education': pd.StringDtype(),\n", + " 'employments': pd.StringDtype(),\n", + " 'number of works': pd.Int16Dtype(),\n", + " 'works source': pd.StringDtype()})" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "df['given names'] = df['given names'].fillna('')\n", + "df['family name'] = df['family name'].fillna('')\n", + "df['biography'] = df['biography'].fillna('')\n", + "df['primary email'] = df['primary email'].fillna('')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "df['other names'] = df['other names'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_url_domains(lst):\n", + " domains = []\n", + " for e in lst:\n", + " # e[0] is a string 
describing the url\n", + " # e[1] is the url\n", + " domain = tldextract.extract(e[1])\n", + " domains.append(domain.registered_domain)\n", + " return domains" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_education(lst):\n", + " educations = []\n", + " for e in lst:\n", + " # e[0] degree\n", + " # e[1] role\n", + " # e[2] university\n", + " # e[..] city, region, country, id, id_scheme\n", + " educations.append(' '.join([e[0], e[1], e[2]]))\n", + " return educations" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_employment(lst):\n", + " res = []\n", + " for e in lst:\n", + " # e[0] role\n", + " # e[1] institute\n", + " # e[..] 
city, region, country, id, id_scheme\n", + " res.append(' '.join([e[0], e[1]]))\n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_email_domains(lst):\n", + " res = []\n", + " for email in lst:\n", + " res.append(email.replace('@', ' '))\n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "df['concat'] = df['given names'] + ' ' + df['family name'] + '\\n' + \\\n", + " df['other names'].apply(lambda x: ' '.join(x)) + '\\n' + \\\n", + " df['primary email'].values[0].replace('@', ' ') + '\\n' + \\\n", + " df['other emails'].apply(lambda x: ' '.join(extract_email_domains(x))) + '\\n' + \\\n", + " df['biography'] + '\\n' + \\\n", + " df['keywords'].apply(lambda x: ' - '.join(x)) + '\\n' + \\\n", + " df['url_domains'].apply(lambda x: ' '.join(x)) + '\\n' + \\\n", + " df['education'].apply(lambda x: '\\n'.join(extract_education(x))) + '\\n' + \\\n", + " df['employments'].apply(lambda x: '\\n'.join(extract_employment(x)))" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Andrea Mannocci\n", + "\n", + "\n", + "\n", + "\n", + "Data science - science of science - scholarly knowledge mining - open science - research infrastructures\n", + "github.io twitter.com linkedin.com\n", + "Information engineering Ph.D. Università degli Studi di Pisa\n", + "Telematics engineering M.Sc. Universidad Carlos III de Madrid\n", + "Computer engineering M.Sc. Università degli Studi di Pisa\n", + "Computer engineering B.Sc. 
Università degli Studi di Pisa\n", + "Research Associate Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche\n", + "Research Associate The Open University\n", + "Research assistant Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche\n", + "Research assistant IMDEA Networks\n", + "Research assistant Syddansk Universitet\n" + ] + } + ], + "source": [ + "print(df[df['orcid'] == AM]['concat'].values[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df['number of works'] > 0, 'label'] = True" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceurl_domainsconcatlabel
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci[][[Personal website, https://andremann.github.i...andrea.mannocci@isti.cnr.it[][Data science , science of science, scholarly ...[[\"Scopus Author ID\", \"55233589900\"]][[Information engineering, Ph.D., Università d...[[Research Associate, Istituto di Scienza e Te...37[\"Scopus - Elsevier\", \"Crossref Metadata Searc...[github.io, twitter.com, linkedin.com]0000-0002-5193-7851\n", + "Andrea Mannocci\n", + "\n", + "andrea.ma...True
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "8840413 0000-0002-5193-7851 True True True \n", + "\n", + " given names family name biography other names \\\n", + "8840413 Andrea Mannocci [] \n", + "\n", + " researcher urls \\\n", + "8840413 [[Personal website, https://andremann.github.i... \n", + "\n", + " primary email other emails \\\n", + "8840413 andrea.mannocci@isti.cnr.it [] \n", + "\n", + " keywords \\\n", + "8840413 [Data science , science of science, scholarly ... \n", + "\n", + " external identifiers \\\n", + "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", + "\n", + " education \\\n", + "8840413 [[Information engineering, Ph.D., Università d... \n", + "\n", + " employments number of works \\\n", + "8840413 [[Research Associate, Istituto di Scienza e Te... 37 \n", + "\n", + " works source \\\n", + "8840413 [\"Scopus - Elsevier\", \"Crossref Metadata Searc... \n", + "\n", + " url_domains \\\n", + "8840413 [github.io, twitter.com, linkedin.com] \n", + "\n", + " concat label \n", + "8840413 0000-0002-5193-7851\n", + "Andrea Mannocci\n", + "\n", + "andrea.ma... True " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == AM]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceurl_domainsconcatlabel
140000-0001-5004-7761TrueTrueTruescaffoldinghire[The first feature that you have to check in t...[[scaffolding hire Wellington, https://www.tig...[][scaffolding hire Wellington]<NA>[][]0<NA>[tigerscaffolds.co.nz]0000-0001-5004-7761\n", + "scaffolding hire\n", + "The first...NaN
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "14 0000-0001-5004-7761 True True True \n", + "\n", + " given names family name biography \\\n", + "14 scaffolding hire \n", + "\n", + " other names \\\n", + "14 [The first feature that you have to check in t... \n", + "\n", + " researcher urls primary email \\\n", + "14 [[scaffolding hire Wellington, https://www.tig... \n", + "\n", + " other emails keywords external identifiers education \\\n", + "14 [] [scaffolding hire Wellington] [] \n", + "\n", + " employments number of works works source url_domains \\\n", + "14 [] 0 [tigerscaffolds.co.nz] \n", + "\n", + " concat label \n", + "14 0000-0001-5004-7761\n", + "scaffolding hire\n", + "The first... NaN " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == SCAFFOLD]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
concatlabel
00000-0001-5000-2053\n", + "Jorge Jaramillo Sanchez \n", + "...NaN
10000-0001-5000-6548\n", + "Wiseman BekelesiNaN
20000-0001-5000-7962\n", + "ALICE INDIMULINaN
30000-0001-5000-8586\n", + "shim ji yunNaN
40000-0001-5001-0256\n", + "Sandro CaramaschiNaN
.........
107470350000-0003-4998-1551\n", + "Animesh GhoshNaN
107470360000-0003-4998-4111\n", + "Hawa LibernaNaN
107470370000-0003-4998-6045\n", + "Tongyi MenNaN
107470380000-0003-4998-8868\n", + "Charldon WilkenNaN
107470390000-0003-4999-7916\n", + "Tapas Bapu B.R.NaN
\n", + "

10747040 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " concat label\n", + "0 0000-0001-5000-2053\n", + "Jorge Jaramillo Sanchez \n", + "... NaN\n", + "1 0000-0001-5000-6548\n", + "Wiseman Bekelesi\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " NaN\n", + "2 0000-0001-5000-7962\n", + "ALICE INDIMULI\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " NaN\n", + "3 0000-0001-5000-8586\n", + "shim ji yun\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " NaN\n", + "4 0000-0001-5001-0256\n", + "Sandro Caramaschi\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " NaN\n", + "... ... ...\n", + "10747035 0000-0003-4998-1551\n", + "Animesh Ghosh\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " NaN\n", + "10747036 0000-0003-4998-4111\n", + "Hawa Liberna\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " NaN\n", + "10747037 0000-0003-4998-6045\n", + "Tongyi Men\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " NaN\n", + "10747038 0000-0003-4998-8868\n", + "Charldon Wilken\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " NaN\n", + "10747039 0000-0003-4999-7916\n", + "Tapas Bapu B.R.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " NaN\n", + "\n", + "[10747040 rows x 2 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df[['orcid', 'concat', 'label']]\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pre-trained spam filter as-is" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'seaborn'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m \u001b[0;31m# linear algebra\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m \u001b[0;31m# data processing, CSV file I/O (e.g. pd.read_csv)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpus\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mstopwords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'seaborn'" + ] + } + ], + "source": [ + "import string\n", + "import torch\n", + "import transformers\n", + "import numpy as np # linear algebra\n", + "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from nltk.corpus import stopwords\n", + "from sklearn.manifold import TSNE\n", + "from nltk.tokenize import word_tokenize\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# One-Class SVM" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.svm import OneClassSVM\n", + "from sklearn.model_selection import train_test_split\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples['features'] = vectorizer.fit_transform(samples['concat'])\n" + ] + } + ], + "source": [ + "samples = df[df['label'] == True]\n", + "vectorizer = TfidfVectorizer()\n", + "samples['features'] = vectorizer.fit_transform(samples['concat'])" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n", + "13 (0, 983769)\\t0.04916990678988556\\n (0, 
1177...\n", + "24 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n", + "26 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n", + "29 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n", + " ... \n", + "10747024 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n", + "10747026 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n", + "10747027 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n", + "10747030 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n", + "10747034 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n", + "Name: features, Length: 2674451, dtype: object" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['features']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainX, testX, trainy, testy = train_test_split(samples['features'], samples['label'], test_size=0.7, random_state=3, stratify=y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = OneClassSVM(gamma='scale', nu=0.01)\n", + "model.fit(trainX)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "could not convert string to float: '0000-0001-5001-4994'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m 
\u001b[0;32mTrue\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/svm/_classes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight, **params)\u001b[0m\n\u001b[1;32m 1374\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1375\u001b[0m \"\"\"\n\u001b[0;32m-> 1376\u001b[0;31m super().fit(X, np.ones(_num_samples(X)),\n\u001b[0m\u001b[1;32m 1377\u001b[0m sample_weight=sample_weight, **params)\n\u001b[1;32m 1378\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moffset_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_intercept_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/svm/_base.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 169\u001b[0;31m X, y = self._validate_data(X, y, dtype=np.float64,\n\u001b[0m\u001b[1;32m 170\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'C'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 171\u001b[0m accept_large_sparse=False)\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m_validate_data\u001b[0;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[1;32m 
431\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 433\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 434\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"y cannot be None\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 814\u001b[0;31m X = check_array(X, accept_sparse=accept_sparse,\n\u001b[0m\u001b[1;32m 815\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m 
\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 558\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhas_pd_integer_array\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 559\u001b[0m \u001b[0;31m# If there are any pandas integer extension arrays,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 560\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 561\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 562\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'allow-nan'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 5875\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5876\u001b[0m \u001b[0;31m# else, only a single dtype is given\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5877\u001b[0;31m \u001b[0mnew_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5878\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"astype\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5879\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 
629\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"raise\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 630\u001b[0m ) -> \"BlockManager\":\n\u001b[0;32m--> 631\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"astype\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 632\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 633\u001b[0m def convert(\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 428\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mTypeError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mignore_failures\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/blocks.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_extension\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 648\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 649\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mValueError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 650\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"ignore\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/string_.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy)\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 307\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 308\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 309\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 310\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: could not convert string to float: '0000-0001-5001-4994'" + ] + } + ], + "source": [ + "model.fit(df[df['label'] == True])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# BERT" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# Language model Databunch\n", + "from fast_bert.data_lm import BertLMDataBunch\n", + "# Language model learner\n", + "from fast_bert.learner_lm import BertLMLearner\n", + "\n", + "from pathlib import Path\n", + "from box import Box\n", + "\n", + "import logging\n", + "logger = logging.getLogger()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# Box is a nice wrapper to create an object from a json dict\n", + "args = Box({\n", + " \"seed\": 42,\n", + " \"task_name\": 'imdb_reviews_lm',\n", + " \"model_name\": 'roberta-base',\n", + " \"model_type\": 'roberta',\n", + " \"train_batch_size\": 16,\n", + " \"learning_rate\": 4e-5,\n", + " \"num_train_epochs\": 20,\n", + " \"fp16\": True,\n", + " \"fp16_opt_level\": \"O2\",\n", + " \"warmup_steps\": 1000,\n", + " \"logging_steps\": 0,\n", + " \"max_seq_length\": 512,\n", + " \"multi_gpu\": False\n", + "})\n", + "\n", + "DATA_PATH = Path('../data/processed')\n", + "LOG_PATH = Path('../logs')\n", + "MODEL_PATH = Path('../models/lm_model_{}/'.format(args.model_type))\n", + "\n", + "DATA_PATH.mkdir(exist_ok=True)\n", + "MODEL_PATH.mkdir(exist_ok=True)\n", + "LOG_PATH.mkdir(exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " 1.27% [123222/9672336 00:03<04:04]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "UnicodeEncodeError", + "evalue": "'utf-8' codec can't encode characters in position 162-163: surrogates not allowed", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m databunch_lm = BertLMDataBunch.from_raw_corpus(\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdata_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDATA_PATH\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtext_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'concat'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mbatch_size_per_gpu\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_batch_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py\u001b[0m in \u001b[0;36mfrom_raw_corpus\u001b[0;34m(data_dir, text_list, tokenizer, batch_size_per_gpu, max_seq_length, multi_gpu, test_size, model_type, logger, clear_cache, no_cache)\u001b[0m\n\u001b[1;32m 191\u001b[0m )\n\u001b[1;32m 192\u001b[0m \u001b[0;31m# Create train corpus\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 193\u001b[0;31m 
\u001b[0mcreate_corpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mtrain_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogger\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 194\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[0;31m# Create val corpus\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py\u001b[0m in \u001b[0;36mcreate_corpus\u001b[0;34m(text_list, target_path, logger)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mUnicodeEncodeError\u001b[0m: 'utf-8' codec can't encode characters in position 162-163: surrogates not allowed" + ] + } + ], + "source": [ + "databunch_lm = BertLMDataBunch.from_raw_corpus(\n", + " data_dir=DATA_PATH,\n", + " text_list=df['concat'],\n", + " tokenizer=args.model_name,\n", + " batch_size_per_gpu=args.train_batch_size,\n", + " max_seq_length=args.max_seq_length,\n", + " multi_gpu=args.multi_gpu,\n", + " model_type=args.model_type,\n", + " logger=logger)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/03-Feature extraction.ipynb b/notebooks/03-Feature extraction.ipynb new file mode 100644 index 0000000..e25ef16 --- /dev/null +++ b/notebooks/03-Feature extraction.ipynb @@ -0,0 +1,2422 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Todo in data\n", + "- Column names -> no space\n", + "- If a list is empty, serialise [] in the csv\n", + "- If a string is empty, serialise '' in the csv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "from urllib.parse import urlparse\n", + "import tldextract\n", + "\n", + "import pandas as pd\n", + "from sklearn.preprocessing import MultiLabelBinarizer\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "mlb = MultiLabelBinarizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Notable Solid ORCID iDs for debug purposes\n", + "AM = '0000-0002-5193-7851'\n", + "PP = '0000-0002-8588-4196'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Notable fake ORCID iDs for debug 
purposes\n", + "SCAFFOLD = '0000-0001-5004-7761'\n", + "WHATSAPP = '0000-0001-6997-9470'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header = 0,\n", + " dtype = {\"orcid\": pd.StringDtype(), \n", + " \"claimed\": bool, \n", + " \"verifyed email\": bool, \n", + " \"verified primary email\": bool,\n", + " \"given names\": pd.StringDtype(),\n", + " \"family name\": pd.StringDtype(),\n", + " \"biography\": pd.StringDtype(),\n", + " \"other names\": pd.StringDtype(),\n", + " \"researcher urls\": pd.StringDtype(),\n", + " \"primary email\": pd.StringDtype(),\n", + " \"other emails\": pd.StringDtype(),\n", + " \"keywords\": pd.StringDtype(),\n", + " \"eternal identifiers\": pd.StringDtype(),\n", + " \"education\": pd.StringDtype(),\n", + " \"employments\": pd.StringDtype(),\n", + " \"number of works\": pd.Int16Dtype(),\n", + " \"works source\": pd.StringDtype()})" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks source
00000-0001-5000-2053TrueFalseFalseJorgeJaramillo Sanchez<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
10000-0001-5000-6548TrueFalseFalseWisemanBekelesi<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
20000-0001-5000-7962TrueTrueTrueALICEINDIMULI<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
30000-0001-5000-8586TrueFalseFalseshimji yun<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
40000-0001-5001-0256TrueFalseFalseSandroCaramaschi<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "0 0000-0001-5000-2053 True False False \n", + "1 0000-0001-5000-6548 True False False \n", + "2 0000-0001-5000-7962 True True True \n", + "3 0000-0001-5000-8586 True False False \n", + "4 0000-0001-5001-0256 True False False \n", + "\n", + " given names family name biography other names researcher urls \\\n", + "0 Jorge Jaramillo Sanchez \n", + "1 Wiseman Bekelesi \n", + "2 ALICE INDIMULI \n", + "3 shim ji yun \n", + "4 Sandro Caramaschi \n", + "\n", + " primary email other emails keywords external identifiers education \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "\n", + " employments number of works works source \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks source
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]][[\"Information engineering\", \"Ph.D.\", \"Univers...[[\"Research Associate\", \"Istituto di Scienza e...37[\"Scopus - Elsevier\", \"Crossref Metadata Searc...
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "8840413 0000-0002-5193-7851 True True True \n", + "\n", + " given names family name biography other names \\\n", + "8840413 Andrea Mannocci \n", + "\n", + " researcher urls \\\n", + "8840413 [[\"Personal website\", \"https://andremann.githu... \n", + "\n", + " primary email other emails \\\n", + "8840413 andrea.mannocci@isti.cnr.it \n", + "\n", + " keywords \\\n", + "8840413 [\"Data science \", \"science of science\", \"schol... \n", + "\n", + " external identifiers \\\n", + "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", + "\n", + " education \\\n", + "8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n", + "\n", + " employments number of works \\\n", + "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n", + "\n", + " works source \n", + "8840413 [\"Scopus - Elsevier\", \"Crossref Metadata Searc... " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == AM]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extracting works source" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_work_source(lst):\n", + " extracted = []\n", + " for s in lst:\n", + " if 'Scopus - Elsevier' in s or 'Crossref' in s:\n", + " extracted.append(s)\n", + " return extracted" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df['extracted_works_source'] = df['works source'].apply(lambda x: extract_work_source(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "df = 
pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']), columns=mlb.classes_)], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksCrossrefCrossref Metadata SearchScopus - Elsevier
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]][[\"Information engineering\", \"Ph.D.\", \"Univers...[[\"Research Associate\", \"Istituto di Scienza e...37111
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "8840413 0000-0002-5193-7851 True True True \n", + "\n", + " given names family name biography other names \\\n", + "8840413 Andrea Mannocci \n", + "\n", + " researcher urls \\\n", + "8840413 [[\"Personal website\", \"https://andremann.githu... \n", + "\n", + " primary email other emails \\\n", + "8840413 andrea.mannocci@isti.cnr.it \n", + "\n", + " keywords \\\n", + "8840413 [\"Data science \", \"science of science\", \"schol... \n", + "\n", + " external identifiers \\\n", + "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", + "\n", + " education \\\n", + "8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n", + "\n", + " employments number of works \\\n", + "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n", + "\n", + " Crossref Crossref Metadata Search Scopus - Elsevier \n", + "8840413 1 1 1 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == AM]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Education" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "df['n_education'] = df['education'].str.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop('education', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifiersemploymentsnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_education
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]][[\"Research Associate\", \"Istituto di Scienza e...371114
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "8840413 0000-0002-5193-7851 True True True \n", + "\n", + " given names family name biography other names \\\n", + "8840413 Andrea Mannocci \n", + "\n", + " researcher urls \\\n", + "8840413 [[\"Personal website\", \"https://andremann.githu... \n", + "\n", + " primary email other emails \\\n", + "8840413 andrea.mannocci@isti.cnr.it \n", + "\n", + " keywords \\\n", + "8840413 [\"Data science \", \"science of science\", \"schol... \n", + "\n", + " external identifiers \\\n", + "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", + "\n", + " employments number of works \\\n", + "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n", + "\n", + " Crossref Crossref Metadata Search Scopus - Elsevier n_education \n", + "8840413 1 1 1 4 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == AM]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Employment" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "df['n_employments'] = df['employments'].str.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop('employments', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifiersnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_educationn_employments
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]]3711145
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "8840413 0000-0002-5193-7851 True True True \n", + "\n", + " given names family name biography other names \\\n", + "8840413 Andrea Mannocci \n", + "\n", + " researcher urls \\\n", + "8840413 [[\"Personal website\", \"https://andremann.githu... \n", + "\n", + " primary email other emails \\\n", + "8840413 andrea.mannocci@isti.cnr.it \n", + "\n", + " keywords \\\n", + "8840413 [\"Data science \", \"science of science\", \"schol... \n", + "\n", + " external identifiers number of works Crossref \\\n", + "8840413 [[\"Scopus Author ID\", \"55233589900\"]] 37 1 \n", + "\n", + " Crossref Metadata Search Scopus - Elsevier n_education \\\n", + "8840413 1 1 4 \n", + "\n", + " n_employments \n", + "8840413 5 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == AM]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# External IDs" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# def extract_ids(lst):\n", + "# extracted = []\n", + "# for id in lst:\n", + "# extracted.append(id[0])\n", + "# return extracted" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + 
"metadata": {}, + "outputs": [], + "source": [ + "df['n_ext_ids'] = df['external identifiers'].str.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(['external identifiers'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_educationn_employmentsn_ext_ids
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...37111451
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "8840413 0000-0002-5193-7851 True True True \n", + "\n", + " given names family name biography other names \\\n", + "8840413 Andrea Mannocci \n", + "\n", + " researcher urls \\\n", + "8840413 [[\"Personal website\", \"https://andremann.githu... \n", + "\n", + " primary email other emails \\\n", + "8840413 andrea.mannocci@isti.cnr.it \n", + "\n", + " keywords number of works \\\n", + "8840413 [\"Data science \", \"science of science\", \"schol... 37 \n", + "\n", + " Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n", + "8840413 1 1 1 4 \n", + "\n", + " n_employments n_ext_ids \n", + "8840413 5 1 " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == AM]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extracting email domains" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "df['primary email'] = df['primary email'].fillna('')\n", + "df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_email_domains(row):\n", + " domains = []\n", + " if len(row['primary email']) > 0:\n", + " domains.append(row['primary email'].split('@')[1])\n", + " for email in row['other emails']:\n", + " domains.append(email.split('@')[1])\n", + " return domains" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "34 [seh.ox.ac.uk, bsg.ox.ac.uk]\n", + "47 
[foxmail.com]\n", + "103 [fvtm.bu.edu.eg]\n", + "297 [unipa.it]\n", + "299 [nhs.net]\n", + " ... \n", + "10746811 [gva.es, gmail.com]\n", + "10746850 [cinvestav.mx]\n", + "10746920 [gmail.com, hotmail.com]\n", + "10746975 [mail.ru]\n", + "10746988 [ucm.es]\n", + "Name: email_domains, Length: 141118, dtype: object" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['email_domains'].str.len() != 0]['email_domains']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[df['orcid'] == AM]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extracting URL domains" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_url_domains(lst):\n", + " domains = []\n", + " for e in lst:\n", + " # e[0] is a string describing the url\n", + " # e[1] is the url\n", + " ext = tldextract.extract(e[1])\n", + " domains.append(ext.registered_domain)\n", + " return domains" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "5 [researchgate.net]\n", + "14 [tigerscaffolds.co.nz]\n", + "15 [corticalbrain.com]\n", + "29 [cnpq.br]\n", + "30 [sksahu.net]\n", + " ... \n", + "10746945 [telegra.ph]\n", + "10746950 [twitter.com, urbanfoodpolicy.com]\n", + "10746955 [openlearning.com]\n", + "10746984 [panaximco.vn]\n", + "10746987 [swansea.ac.uk]\n", + "Name: url_domains, Length: 688572, dtype: object" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['url_domains'].str.len() != 0]['url_domains']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother nameskeywordsnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_educationn_employmentsn_ext_idsemail_domainsurl_domains
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[\"Data science \", \"science of science\", \"schol...37111451[isti.cnr.it][github.io, twitter.com, linkedin.com]
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "8840413 0000-0002-5193-7851 True True True \n", + "\n", + " given names family name biography other names \\\n", + "8840413 Andrea Mannocci \n", + "\n", + " keywords number of works \\\n", + "8840413 [\"Data science \", \"science of science\", \"schol... 37 \n", + "\n", + " Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n", + "8840413 1 1 1 4 \n", + "\n", + " n_employments n_ext_ids email_domains \\\n", + "8840413 5 1 [isti.cnr.it] \n", + "\n", + " url_domains \n", + "8840413 [github.io, twitter.com, linkedin.com] " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == AM]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fixing keywords" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes, different keywords are provided as a continuum (multiplexed in just one keyword). E.g." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domains
96017050000-0002-8588-4196TrueTrueTruePedroPríncipePedro Príncipe is an information, documentatio...[\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"][][][open access, open science, libraries, reposit...[[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]]<NA>[[\"Librarian / Project manager\", \"Universidade...5[\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"...[][]
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "9601705 0000-0002-8588-4196 True True True \n", + "\n", + " given names family name \\\n", + "9601705 Pedro Príncipe \n", + "\n", + " biography \\\n", + "9601705 Pedro Príncipe is an information, documentatio... \n", + "\n", + " other names researcher urls \\\n", + "9601705 [\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"] [] \n", + "\n", + " primary email other emails \\\n", + "9601705 [] \n", + "\n", + " keywords \\\n", + "9601705 [open access, open science, libraries, reposit... \n", + "\n", + " external identifiers education \\\n", + "9601705 [[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]] \n", + "\n", + " employments number of works \\\n", + "9601705 [[\"Librarian / Project manager\", \"Universidade... 5 \n", + "\n", + " works source email_domains \\\n", + "9601705 [\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"... [] \n", + "\n", + " url_domains \n", + "9601705 [] " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == PP]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def fix_keywords(lst):\n", + " fixed = []\n", + " for k in lst:\n", + " split = k.split(',')\n", + " fixed.extend(split)\n", + " return fixed" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['open access',\n", + " ' open science',\n", + " ' libraries',\n", + " ' repositories',\n", + " ' social web',\n", + " '']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test = ['open access, open science, libraries, repositories, social web,']\n", + "fix_keywords(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "df['fixed_keywords'] = 
df['keywords'].apply(lambda lst: fix_keywords(lst))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domainsfixed_keywords
95170990000-0001-6997-9470TrueTrueTrueotherwhatsapp<NA><NA>[[Otherwhatsapp, https://otherwhatsapp.com/], ...[][Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...NaN<NA><NA>0<NA>[][otherwhatsapp.com, im-creator.com, facebook.c...[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...
\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email verified primary email \\\n", + "9517099 0000-0001-6997-9470 True True True \n", + "\n", + " given names family name biography other names \\\n", + "9517099 other whatsapp \n", + "\n", + " researcher urls primary email \\\n", + "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... \n", + "\n", + " other emails keywords \\\n", + "9517099 [] [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... \n", + "\n", + " external identifiers education employments number of works \\\n", + "9517099 NaN 0 \n", + "\n", + " works source email_domains \\\n", + "9517099 [] \n", + "\n", + " url_domains \\\n", + "9517099 [otherwhatsapp.com, im-creator.com, facebook.c... \n", + "\n", + " fixed_keywords \n", + "9517099 [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == WHATSAPP]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domainsfixed_keywords
00000-0001-5000-2053TrueFalseFalseJorgeJaramillo Sanchez<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
10000-0001-5000-6548TrueFalseFalseWisemanBekelesi<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
20000-0001-5000-7962TrueTrueTrueALICEINDIMULI<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
30000-0001-5000-8586TrueFalseFalseshimji yun<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
40000-0001-5001-0256TrueFalseFalseSandroCaramaschi<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
............................................................
107470350000-0003-4998-1551TrueFalseFalseAnimeshGhosh<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470360000-0003-4998-4111TrueFalseFalseHawaLiberna<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470370000-0003-4998-6045TrueFalseFalseTongyiMen<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470380000-0003-4998-8868TrueTrueFalseCharldonWilken<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470390000-0003-4999-7916TrueTrueTrueTapas BapuB.R.<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
\n", + "

10747040 rows × 19 columns

\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email \\\n", + "0 0000-0001-5000-2053 True False \n", + "1 0000-0001-5000-6548 True False \n", + "2 0000-0001-5000-7962 True True \n", + "3 0000-0001-5000-8586 True False \n", + "4 0000-0001-5001-0256 True False \n", + "... ... ... ... \n", + "10747035 0000-0003-4998-1551 True False \n", + "10747036 0000-0003-4998-4111 True False \n", + "10747037 0000-0003-4998-6045 True False \n", + "10747038 0000-0003-4998-8868 True True \n", + "10747039 0000-0003-4999-7916 True True \n", + "\n", + " verified primary email given names family name biography \\\n", + "0 False Jorge Jaramillo Sanchez \n", + "1 False Wiseman Bekelesi \n", + "2 True ALICE INDIMULI \n", + "3 False shim ji yun \n", + "4 False Sandro Caramaschi \n", + "... ... ... ... ... \n", + "10747035 False Animesh Ghosh \n", + "10747036 False Hawa Liberna \n", + "10747037 False Tongyi Men \n", + "10747038 False Charldon Wilken \n", + "10747039 True Tapas Bapu B.R. \n", + "\n", + " other names researcher urls primary email other emails \\\n", + "0 [] [] \n", + "1 [] [] \n", + "2 [] [] \n", + "3 [] [] \n", + "4 [] [] \n", + "... ... ... ... ... \n", + "10747035 [] [] \n", + "10747036 [] [] \n", + "10747037 [] [] \n", + "10747038 [] [] \n", + "10747039 [] [] \n", + "\n", + " external identifiers education employments number of works \\\n", + "0 NaN 0 \n", + "1 NaN 0 \n", + "2 NaN 0 \n", + "3 NaN 0 \n", + "4 NaN 0 \n", + "... ... ... ... ... \n", + "10747035 NaN 0 \n", + "10747036 NaN 0 \n", + "10747037 NaN 0 \n", + "10747038 NaN 0 \n", + "10747039 NaN 0 \n", + "\n", + " works source email_domains url_domains fixed_keywords \n", + "0 [] [] [] \n", + "1 [] [] [] \n", + "2 [] [] [] \n", + "3 [] [] [] \n", + "4 [] [] [] \n", + "... ... ... ... ... 
\n", + "10747035 [] [] [] \n", + "10747036 [] [] [] \n", + "10747037 [] [] [] \n", + "10747038 [] [] [] \n", + "10747039 [] [] [] \n", + "\n", + "[10747040 rows x 19 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.drop('keywords', axis=1, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fixes for other columns with lists inside" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))\n", + "# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", + "# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", + "# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", + "# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", + "# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", + "# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", + "# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature extraction" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# df['email_encoding'] = mlb.fit_transform(df['email_domains'])\n", + "# df['url_encoding'] = mlb.fit_transform(df['url_domains'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domainsfixed_keywords
00000-0001-5000-2053TrueFalseFalseJorgeJaramillo Sanchez<NA><NA>[][][][][][]0[][][][]
10000-0001-5000-6548TrueFalseFalseWisemanBekelesi<NA><NA>[][][][][][]0[][][][]
20000-0001-5000-7962TrueTrueTrueALICEINDIMULI<NA><NA>[][][][][][]0[][][][]
30000-0001-5000-8586TrueFalseFalseshimji yun<NA><NA>[][][][][][]0[][][][]
40000-0001-5001-0256TrueFalseFalseSandroCaramaschi<NA><NA>[][][][][][]0[][][][]
...............................................................
107470350000-0003-4998-1551TrueFalseFalseAnimeshGhosh<NA><NA>[][][][][][]0[][][][]
107470360000-0003-4998-4111TrueFalseFalseHawaLiberna<NA><NA>[][][][][][]0[][][][]
107470370000-0003-4998-6045TrueFalseFalseTongyiMen<NA><NA>[][][][][][]0[][][][]
107470380000-0003-4998-8868TrueTrueFalseCharldonWilken<NA><NA>[][][][][][]0[][][][]
107470390000-0003-4999-7916TrueTrueTrueTapas BapuB.R.<NA><NA>[][][][][][]0[][][][]
\n", + "

10747040 rows × 20 columns

\n", + "
" + ], + "text/plain": [ + " orcid claimed verifyed email \\\n", + "0 0000-0001-5000-2053 True False \n", + "1 0000-0001-5000-6548 True False \n", + "2 0000-0001-5000-7962 True True \n", + "3 0000-0001-5000-8586 True False \n", + "4 0000-0001-5001-0256 True False \n", + "... ... ... ... \n", + "10747035 0000-0003-4998-1551 True False \n", + "10747036 0000-0003-4998-4111 True False \n", + "10747037 0000-0003-4998-6045 True False \n", + "10747038 0000-0003-4998-8868 True True \n", + "10747039 0000-0003-4999-7916 True True \n", + "\n", + " verified primary email given names family name biography \\\n", + "0 False Jorge Jaramillo Sanchez \n", + "1 False Wiseman Bekelesi \n", + "2 True ALICE INDIMULI \n", + "3 False shim ji yun \n", + "4 False Sandro Caramaschi \n", + "... ... ... ... ... \n", + "10747035 False Animesh Ghosh \n", + "10747036 False Hawa Liberna \n", + "10747037 False Tongyi Men \n", + "10747038 False Charldon Wilken \n", + "10747039 True Tapas Bapu B.R. \n", + "\n", + " other names researcher urls primary email other emails keywords \\\n", + "0 [] [] [] \n", + "1 [] [] [] \n", + "2 [] [] [] \n", + "3 [] [] [] \n", + "4 [] [] [] \n", + "... ... ... ... ... ... \n", + "10747035 [] [] [] \n", + "10747036 [] [] [] \n", + "10747037 [] [] [] \n", + "10747038 [] [] [] \n", + "10747039 [] [] [] \n", + "\n", + " external identifiers education employments number of works \\\n", + "0 [] [] [] 0 \n", + "1 [] [] [] 0 \n", + "2 [] [] [] 0 \n", + "3 [] [] [] 0 \n", + "4 [] [] [] 0 \n", + "... ... ... ... ... \n", + "10747035 [] [] [] 0 \n", + "10747036 [] [] [] 0 \n", + "10747037 [] [] [] 0 \n", + "10747038 [] [] [] 0 \n", + "10747039 [] [] [] 0 \n", + "\n", + " works source email_domains url_domains fixed_keywords \n", + "0 [] [] [] [] \n", + "1 [] [] [] [] \n", + "2 [] [] [] [] \n", + "3 [] [] [] [] \n", + "4 [] [] [] [] \n", + "... ... ... ... ... 
\n", + "10747035 [] [] [] [] \n", + "10747036 [] [] [] [] \n", + "10747037 [] [] [] [] \n", + "10747038 [] [] [] [] \n", + "10747039 [] [] [] [] \n", + "\n", + "[10747040 rows x 20 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/references/.gitkeep b/references/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/reports/.gitkeep b/reports/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/reports/figures/.gitkeep b/reports/figures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d4f7d11 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +# local package +-e . 
+ +# external requirements +click +Sphinx +coverage +awscli +flake8 +python-dotenv>=0.5.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..570ed7b --- /dev/null +++ b/setup.py @@ -0,0 +1,10 @@ +from setuptools import find_packages, setup + +setup( + name='src', + packages=find_packages(), + version='0.1.0', + description='A short description of the project.', + author='Andrea Mannocci', + license='MIT', +) diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/.gitkeep b/src/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/data/__init__.py b/src/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py new file mode 100644 index 0000000..96b377a --- /dev/null +++ b/src/data/make_dataset.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +import click +import logging +from pathlib import Path +from dotenv import find_dotenv, load_dotenv + + +@click.command() +@click.argument('input_filepath', type=click.Path(exists=True)) +@click.argument('output_filepath', type=click.Path()) +def main(input_filepath, output_filepath): + """ Runs data processing scripts to turn raw data from (../raw) into + cleaned data ready to be analyzed (saved in ../processed). 
+ """ + logger = logging.getLogger(__name__) + logger.info('making final data set from raw data') + + +if __name__ == '__main__': + log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + logging.basicConfig(level=logging.INFO, format=log_fmt) + + # not used in this stub but often useful for finding various files + project_dir = Path(__file__).resolve().parents[2] + + # find .env automagically by walking up directories until it's found, then + # load up the .env entries as environment variables + load_dotenv(find_dotenv()) + + main() diff --git a/src/features/.gitkeep b/src/features/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/features/__init__.py b/src/features/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/features/build_features.py b/src/features/build_features.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/.gitkeep b/src/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/predict_model.py b/src/models/predict_model.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/train_model.py b/src/models/train_model.py new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/.gitkeep b/src/visualization/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/__init__.py b/src/visualization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/visualize.py b/src/visualization/visualize.py new file mode 100644 index 0000000..e69de29 diff --git a/test_environment.py b/test_environment.py new file mode 100644 index 0000000..d0ac4a7 --- /dev/null +++ b/test_environment.py @@ -0,0 +1,25 @@ +import sys + +REQUIRED_PYTHON = "python3" + + +def main(): + system_major = sys.version_info.major + if REQUIRED_PYTHON == "python": + required_major = 2 + elif 
REQUIRED_PYTHON == "python3": + required_major = 3 + else: + raise ValueError("Unrecognized python interpreter: {}".format( + REQUIRED_PYTHON)) + + if system_major != required_major: + raise TypeError( + "This project requires Python {}. Found: Python {}".format( + required_major, sys.version)) + else: + print(">>> Development environment passes all tests!") + + +if __name__ == '__main__': + main() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..c32fbd8 --- /dev/null +++ b/tox.ini @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 79 +max-complexity = 10