{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Todo in data\n", "- Column names -> no space\n", "- If a list is empty, serialise [] in the csv\n", "- If a string is empty, serialise '' in the csv" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import ast\n", "from urllib.parse import urlparse\n", "import tldextract\n", "\n", "import pandas as pd\n", "from sklearn.preprocessing import MultiLabelBinarizer\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "mlb = MultiLabelBinarizer()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Notable Solid ORCID iDs for debug purposes\n", "AM = '0000-0002-5193-7851'\n", "PP = '0000-0002-8588-4196'\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Notable fake ORCID iDs for debug purposes\n", "SCAFFOLD = '0000-0001-5004-7761'\n", "WHATSAPP = '0000-0001-6997-9470'\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header = 0,\n", " dtype = {\"orcid\": pd.StringDtype(), \n", " \"claimed\": bool, \n", " \"verifyed email\": bool, \n", " \"verified primary email\": bool,\n", " \"given names\": pd.StringDtype(),\n", " \"family name\": pd.StringDtype(),\n", " \"biography\": pd.StringDtype(),\n", " \"other names\": pd.StringDtype(),\n", " \"researcher urls\": pd.StringDtype(),\n", " \"primary email\": pd.StringDtype(),\n", " \"other emails\": pd.StringDtype(),\n", " \"keywords\": pd.StringDtype(),\n", " \"eternal identifiers\": pd.StringDtype(),\n", " \"education\": pd.StringDtype(),\n", " \"employments\": pd.StringDtype(),\n", " \"number of works\": pd.Int16Dtype(),\n", " \"works source\": pd.StringDtype()})" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks source
00000-0001-5000-2053TrueFalseFalseJorgeJaramillo Sanchez<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
10000-0001-5000-6548TrueFalseFalseWisemanBekelesi<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
20000-0001-5000-7962TrueTrueTrueALICEINDIMULI<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
30000-0001-5000-8586TrueFalseFalseshimji yun<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
40000-0001-5001-0256TrueFalseFalseSandroCaramaschi<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
\n", "
" ], "text/plain": [ " orcid claimed verifyed email verified primary email \\\n", "0 0000-0001-5000-2053 True False False \n", "1 0000-0001-5000-6548 True False False \n", "2 0000-0001-5000-7962 True True True \n", "3 0000-0001-5000-8586 True False False \n", "4 0000-0001-5001-0256 True False False \n", "\n", " given names family name biography other names researcher urls \\\n", "0 Jorge Jaramillo Sanchez \n", "1 Wiseman Bekelesi \n", "2 ALICE INDIMULI \n", "3 shim ji yun \n", "4 Sandro Caramaschi \n", "\n", " primary email other emails keywords external identifiers education \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "\n", " employments number of works works source \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(5)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks source
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]][[\"Information engineering\", \"Ph.D.\", \"Univers...[[\"Research Associate\", \"Istituto di Scienza e...37[\"Scopus - Elsevier\", \"Crossref Metadata Searc...
\n", "
" ], "text/plain": [ " orcid claimed verifyed email verified primary email \\\n", "8840413 0000-0002-5193-7851 True True True \n", "\n", " given names family name biography other names \\\n", "8840413 Andrea Mannocci \n", "\n", " researcher urls \\\n", "8840413 [[\"Personal website\", \"https://andremann.githu... \n", "\n", " primary email other emails \\\n", "8840413 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", "8840413 [\"Data science \", \"science of science\", \"schol... \n", "\n", " external identifiers \\\n", "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", "\n", " education \\\n", "8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n", "\n", " employments number of works \\\n", "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n", "\n", " works source \n", "8840413 [\"Scopus - Elsevier\", \"Crossref Metadata Searc... " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == AM]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Extracting works source" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def extract_work_source(lst):\n", " extracted = []\n", " for s in lst:\n", " if 'Scopus - Elsevier' in s or 'Crossref' in s:\n", " extracted.append(s)\n", " return extracted" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df['extracted_works_source'] = df['works source'].apply(lambda x: extract_work_source(x))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']), columns=mlb.classes_)], axis = 1)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksCrossrefCrossref Metadata SearchScopus - Elsevier
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]][[\"Information engineering\", \"Ph.D.\", \"Univers...[[\"Research Associate\", \"Istituto di Scienza e...37111
\n", "
" ], "text/plain": [ " orcid claimed verifyed email verified primary email \\\n", "8840413 0000-0002-5193-7851 True True True \n", "\n", " given names family name biography other names \\\n", "8840413 Andrea Mannocci \n", "\n", " researcher urls \\\n", "8840413 [[\"Personal website\", \"https://andremann.githu... \n", "\n", " primary email other emails \\\n", "8840413 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", "8840413 [\"Data science \", \"science of science\", \"schol... \n", "\n", " external identifiers \\\n", "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", "\n", " education \\\n", "8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n", "\n", " employments number of works \\\n", "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n", "\n", " Crossref Crossref Metadata Search Scopus - Elsevier \n", "8840413 1 1 1 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == AM]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Education" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "df['n_education'] = df['education'].str.len()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "df.drop('education', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifiersemploymentsnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_education
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]][[\"Research Associate\", \"Istituto di Scienza e...371114
\n", "
" ], "text/plain": [ " orcid claimed verifyed email verified primary email \\\n", "8840413 0000-0002-5193-7851 True True True \n", "\n", " given names family name biography other names \\\n", "8840413 Andrea Mannocci \n", "\n", " researcher urls \\\n", "8840413 [[\"Personal website\", \"https://andremann.githu... \n", "\n", " primary email other emails \\\n", "8840413 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", "8840413 [\"Data science \", \"science of science\", \"schol... \n", "\n", " external identifiers \\\n", "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", "\n", " employments number of works \\\n", "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n", "\n", " Crossref Crossref Metadata Search Scopus - Elsevier n_education \n", "8840413 1 1 1 4 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == AM]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Employment" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "df['n_employments'] = df['employments'].str.len()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "df.drop('employments', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifiersnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_educationn_employments
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]]3711145
\n", "
" ], "text/plain": [ " orcid claimed verifyed email verified primary email \\\n", "8840413 0000-0002-5193-7851 True True True \n", "\n", " given names family name biography other names \\\n", "8840413 Andrea Mannocci \n", "\n", " researcher urls \\\n", "8840413 [[\"Personal website\", \"https://andremann.githu... \n", "\n", " primary email other emails \\\n", "8840413 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", "8840413 [\"Data science \", \"science of science\", \"schol... \n", "\n", " external identifiers number of works Crossref \\\n", "8840413 [[\"Scopus Author ID\", \"55233589900\"]] 37 1 \n", "\n", " Crossref Metadata Search Scopus - Elsevier n_education \\\n", "8840413 1 1 4 \n", "\n", " n_employments \n", "8840413 5 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == AM]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# External IDs" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# def extract_ids(lst):\n", "# extracted = []\n", "# for id in lst:\n", "# extracted.append(id[0])\n", "# return extracted" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "df['n_ext_ids'] = df['external identifiers'].str.len()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "df.drop(['external identifiers'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_educationn_employmentsn_ext_ids
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...37111451
\n", "
" ], "text/plain": [ " orcid claimed verifyed email verified primary email \\\n", "8840413 0000-0002-5193-7851 True True True \n", "\n", " given names family name biography other names \\\n", "8840413 Andrea Mannocci \n", "\n", " researcher urls \\\n", "8840413 [[\"Personal website\", \"https://andremann.githu... \n", "\n", " primary email other emails \\\n", "8840413 andrea.mannocci@isti.cnr.it \n", "\n", " keywords number of works \\\n", "8840413 [\"Data science \", \"science of science\", \"schol... 37 \n", "\n", " Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n", "8840413 1 1 1 4 \n", "\n", " n_employments n_ext_ids \n", "8840413 5 1 " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == AM]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Extracting email domains" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "df['primary email'] = df['primary email'].fillna('')\n", "df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "def extract_email_domains(row):\n", " domains = []\n", " if len(row['primary email']) > 0:\n", " domains.append(row['primary email'].split('@')[1])\n", " for email in row['other emails']:\n", " domains.append(email.split('@')[1])\n", " return domains" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "34 [seh.ox.ac.uk, bsg.ox.ac.uk]\n", "47 [foxmail.com]\n", "103 [fvtm.bu.edu.eg]\n", "297 [unipa.it]\n", "299 [nhs.net]\n", " ... \n", "10746811 [gva.es, gmail.com]\n", "10746850 [cinvestav.mx]\n", "10746920 [gmail.com, hotmail.com]\n", "10746975 [mail.ru]\n", "10746988 [ucm.es]\n", "Name: email_domains, Length: 141118, dtype: object" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['email_domains'].str.len() != 0]['email_domains']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['orcid'] == AM]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Extracting URL domains" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "def extract_url_domains(lst):\n", " domains = []\n", " for e in lst:\n", " # e[0] is a string describing the url\n", " # e[1] is the url\n", " ext = tldextract.extract(e[1])\n", " domains.append(ext.registered_domain)\n", " return domains" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5 [researchgate.net]\n", "14 [tigerscaffolds.co.nz]\n", "15 [corticalbrain.com]\n", "29 [cnpq.br]\n", "30 [sksahu.net]\n", " ... \n", "10746945 [telegra.ph]\n", "10746950 [twitter.com, urbanfoodpolicy.com]\n", "10746955 [openlearning.com]\n", "10746984 [panaximco.vn]\n", "10746987 [swansea.ac.uk]\n", "Name: url_domains, Length: 688572, dtype: object" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['url_domains'].str.len() != 0]['url_domains']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother nameskeywordsnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_educationn_employmentsn_ext_idsemail_domainsurl_domains
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[\"Data science \", \"science of science\", \"schol...37111451[isti.cnr.it][github.io, twitter.com, linkedin.com]
\n", "
" ], "text/plain": [ " orcid claimed verifyed email verified primary email \\\n", "8840413 0000-0002-5193-7851 True True True \n", "\n", " given names family name biography other names \\\n", "8840413 Andrea Mannocci \n", "\n", " keywords number of works \\\n", "8840413 [\"Data science \", \"science of science\", \"schol... 37 \n", "\n", " Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n", "8840413 1 1 1 4 \n", "\n", " n_employments n_ext_ids email_domains \\\n", "8840413 5 1 [isti.cnr.it] \n", "\n", " url_domains \n", "8840413 [github.io, twitter.com, linkedin.com] " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == AM]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fixing keywords" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sometimes, different keywords are provided as a continuum (multiplexed in just one keyword). E.g." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domains
96017050000-0002-8588-4196TrueTrueTruePedroPríncipePedro Príncipe is an information, documentatio...[\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"][][][open access, open science, libraries, reposit...[[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]]<NA>[[\"Librarian / Project manager\", \"Universidade...5[\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"...[][]
\n", "
" ], "text/plain": [ " orcid claimed verifyed email verified primary email \\\n", "9601705 0000-0002-8588-4196 True True True \n", "\n", " given names family name \\\n", "9601705 Pedro Príncipe \n", "\n", " biography \\\n", "9601705 Pedro Príncipe is an information, documentatio... \n", "\n", " other names researcher urls \\\n", "9601705 [\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"] [] \n", "\n", " primary email other emails \\\n", "9601705 [] \n", "\n", " keywords \\\n", "9601705 [open access, open science, libraries, reposit... \n", "\n", " external identifiers education \\\n", "9601705 [[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]] \n", "\n", " employments number of works \\\n", "9601705 [[\"Librarian / Project manager\", \"Universidade... 5 \n", "\n", " works source email_domains \\\n", "9601705 [\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"... [] \n", "\n", " url_domains \n", "9601705 [] " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == PP]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def fix_keywords(lst):\n", " fixed = []\n", " for k in lst:\n", " split = k.split(',')\n", " fixed.extend(split)\n", " return fixed" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['open access',\n", " ' open science',\n", " ' libraries',\n", " ' repositories',\n", " ' social web',\n", " '']" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test = ['open access, open science, libraries, repositories, social web,']\n", "fix_keywords(test)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "df['fixed_keywords'] = df['keywords'].apply(lambda lst: fix_keywords(lst))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domainsfixed_keywords
95170990000-0001-6997-9470TrueTrueTrueotherwhatsapp<NA><NA>[[Otherwhatsapp, https://otherwhatsapp.com/], ...[][Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...NaN<NA><NA>0<NA>[][otherwhatsapp.com, im-creator.com, facebook.c...[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...
\n", "
" ], "text/plain": [ " orcid claimed verifyed email verified primary email \\\n", "9517099 0000-0001-6997-9470 True True True \n", "\n", " given names family name biography other names \\\n", "9517099 other whatsapp \n", "\n", " researcher urls primary email \\\n", "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... \n", "\n", " other emails keywords \\\n", "9517099 [] [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... \n", "\n", " external identifiers education employments number of works \\\n", "9517099 NaN 0 \n", "\n", " works source email_domains \\\n", "9517099 [] \n", "\n", " url_domains \\\n", "9517099 [otherwhatsapp.com, im-creator.com, facebook.c... \n", "\n", " fixed_keywords \n", "9517099 [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == WHATSAPP]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domainsfixed_keywords
00000-0001-5000-2053TrueFalseFalseJorgeJaramillo Sanchez<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
10000-0001-5000-6548TrueFalseFalseWisemanBekelesi<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
20000-0001-5000-7962TrueTrueTrueALICEINDIMULI<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
30000-0001-5000-8586TrueFalseFalseshimji yun<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
40000-0001-5001-0256TrueFalseFalseSandroCaramaschi<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
............................................................
107470350000-0003-4998-1551TrueFalseFalseAnimeshGhosh<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470360000-0003-4998-4111TrueFalseFalseHawaLiberna<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470370000-0003-4998-6045TrueFalseFalseTongyiMen<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470380000-0003-4998-8868TrueTrueFalseCharldonWilken<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470390000-0003-4999-7916TrueTrueTrueTapas BapuB.R.<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
\n", "

10747040 rows × 19 columns

\n", "
" ], "text/plain": [ " orcid claimed verifyed email \\\n", "0 0000-0001-5000-2053 True False \n", "1 0000-0001-5000-6548 True False \n", "2 0000-0001-5000-7962 True True \n", "3 0000-0001-5000-8586 True False \n", "4 0000-0001-5001-0256 True False \n", "... ... ... ... \n", "10747035 0000-0003-4998-1551 True False \n", "10747036 0000-0003-4998-4111 True False \n", "10747037 0000-0003-4998-6045 True False \n", "10747038 0000-0003-4998-8868 True True \n", "10747039 0000-0003-4999-7916 True True \n", "\n", " verified primary email given names family name biography \\\n", "0 False Jorge Jaramillo Sanchez \n", "1 False Wiseman Bekelesi \n", "2 True ALICE INDIMULI \n", "3 False shim ji yun \n", "4 False Sandro Caramaschi \n", "... ... ... ... ... \n", "10747035 False Animesh Ghosh \n", "10747036 False Hawa Liberna \n", "10747037 False Tongyi Men \n", "10747038 False Charldon Wilken \n", "10747039 True Tapas Bapu B.R. \n", "\n", " other names researcher urls primary email other emails \\\n", "0 [] [] \n", "1 [] [] \n", "2 [] [] \n", "3 [] [] \n", "4 [] [] \n", "... ... ... ... ... \n", "10747035 [] [] \n", "10747036 [] [] \n", "10747037 [] [] \n", "10747038 [] [] \n", "10747039 [] [] \n", "\n", " external identifiers education employments number of works \\\n", "0 NaN 0 \n", "1 NaN 0 \n", "2 NaN 0 \n", "3 NaN 0 \n", "4 NaN 0 \n", "... ... ... ... ... \n", "10747035 NaN 0 \n", "10747036 NaN 0 \n", "10747037 NaN 0 \n", "10747038 NaN 0 \n", "10747039 NaN 0 \n", "\n", " works source email_domains url_domains fixed_keywords \n", "0 [] [] [] \n", "1 [] [] [] \n", "2 [] [] [] \n", "3 [] [] [] \n", "4 [] [] [] \n", "... ... ... ... ... \n", "10747035 [] [] [] \n", "10747036 [] [] [] \n", "10747037 [] [] [] \n", "10747038 [] [] [] \n", "10747039 [] [] [] \n", "\n", "[10747040 rows x 19 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop('keywords', axis=1, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fixes for other columns with lists inside" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))\n", "# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", "# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", "# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", "# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", "# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", "# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", "# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Feature extraction" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# df['email_encoding'] = mlb.fit_transform(df['email_domains'])\n", "# df['url_encoding'] = mlb.fit_transform(df['url_domains'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domainsfixed_keywords
00000-0001-5000-2053TrueFalseFalseJorgeJaramillo Sanchez<NA><NA>[][][][][][]0[][][][]
10000-0001-5000-6548TrueFalseFalseWisemanBekelesi<NA><NA>[][][][][][]0[][][][]
20000-0001-5000-7962TrueTrueTrueALICEINDIMULI<NA><NA>[][][][][][]0[][][][]
30000-0001-5000-8586TrueFalseFalseshimji yun<NA><NA>[][][][][][]0[][][][]
40000-0001-5001-0256TrueFalseFalseSandroCaramaschi<NA><NA>[][][][][][]0[][][][]
...............................................................
107470350000-0003-4998-1551TrueFalseFalseAnimeshGhosh<NA><NA>[][][][][][]0[][][][]
107470360000-0003-4998-4111TrueFalseFalseHawaLiberna<NA><NA>[][][][][][]0[][][][]
107470370000-0003-4998-6045TrueFalseFalseTongyiMen<NA><NA>[][][][][][]0[][][][]
107470380000-0003-4998-8868TrueTrueFalseCharldonWilken<NA><NA>[][][][][][]0[][][][]
107470390000-0003-4999-7916TrueTrueTrueTapas BapuB.R.<NA><NA>[][][][][][]0[][][][]
\n", "

10747040 rows × 20 columns

\n", "
" ], "text/plain": [ " orcid claimed verifyed email \\\n", "0 0000-0001-5000-2053 True False \n", "1 0000-0001-5000-6548 True False \n", "2 0000-0001-5000-7962 True True \n", "3 0000-0001-5000-8586 True False \n", "4 0000-0001-5001-0256 True False \n", "... ... ... ... \n", "10747035 0000-0003-4998-1551 True False \n", "10747036 0000-0003-4998-4111 True False \n", "10747037 0000-0003-4998-6045 True False \n", "10747038 0000-0003-4998-8868 True True \n", "10747039 0000-0003-4999-7916 True True \n", "\n", " verified primary email given names family name biography \\\n", "0 False Jorge Jaramillo Sanchez \n", "1 False Wiseman Bekelesi \n", "2 True ALICE INDIMULI \n", "3 False shim ji yun \n", "4 False Sandro Caramaschi \n", "... ... ... ... ... \n", "10747035 False Animesh Ghosh \n", "10747036 False Hawa Liberna \n", "10747037 False Tongyi Men \n", "10747038 False Charldon Wilken \n", "10747039 True Tapas Bapu B.R. \n", "\n", " other names researcher urls primary email other emails keywords \\\n", "0 [] [] [] \n", "1 [] [] [] \n", "2 [] [] [] \n", "3 [] [] [] \n", "4 [] [] [] \n", "... ... ... ... ... ... \n", "10747035 [] [] [] \n", "10747036 [] [] [] \n", "10747037 [] [] [] \n", "10747038 [] [] [] \n", "10747039 [] [] [] \n", "\n", " external identifiers education employments number of works \\\n", "0 [] [] [] 0 \n", "1 [] [] [] 0 \n", "2 [] [] [] 0 \n", "3 [] [] [] 0 \n", "4 [] [] [] 0 \n", "... ... ... ... ... \n", "10747035 [] [] [] 0 \n", "10747036 [] [] [] 0 \n", "10747037 [] [] [] 0 \n", "10747038 [] [] [] 0 \n", "10747039 [] [] [] 0 \n", "\n", " works source email_domains url_domains fixed_keywords \n", "0 [] [] [] [] \n", "1 [] [] [] [] \n", "2 [] [] [] [] \n", "3 [] [] [] [] \n", "4 [] [] [] [] \n", "... ... ... ... ... \n", "10747035 [] [] [] [] \n", "10747036 [] [] [] [] \n", "10747037 [] [] [] [] \n", "10747038 [] [] [] [] \n", "10747039 [] [] [] [] \n", "\n", "[10747040 rows x 20 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }