fake-orcid-analysis/notebooks/03-Feature extraction.ipynb

2423 lines
82 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Todo in data\n",
"- Column names -> no space\n",
"- If a list is empty, serialise [] in the csv\n",
"- If a string is empty, serialise '' in the csv"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"from urllib.parse import urlparse\n",
"import tldextract\n",
"\n",
"import pandas as pd\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"mlb = MultiLabelBinarizer()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Notable solid (i.e. non-fake) ORCID iDs, used to spot-check single rows while debugging\n",
"AM = '0000-0002-5193-7851'  # Andrea Mannocci\n",
"PP = '0000-0002-8588-4196'  # Pedro Príncipe\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Notable fake ORCID iDs for debug purposes\n",
"SCAFFOLD = '0000-0001-5004-7761'\n",
"WHATSAPP = '0000-0001-6997-9470'\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Load the raw ORCID dump. Text and serialised-list columns are read with the\n",
"# nullable string dtype so that missing cells show up as <NA> in every column\n",
"# and the later fillna(...) calls behave uniformly.\n",
"# NOTE: \"verifyed email\" is spelled exactly as in the TSV header; keep it as is.\n",
"df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n",
"                 dtype={\"orcid\": pd.StringDtype(),\n",
"                        \"claimed\": bool,\n",
"                        \"verifyed email\": bool,\n",
"                        \"verified primary email\": bool,\n",
"                        \"given names\": pd.StringDtype(),\n",
"                        \"family name\": pd.StringDtype(),\n",
"                        \"biography\": pd.StringDtype(),\n",
"                        \"other names\": pd.StringDtype(),\n",
"                        \"researcher urls\": pd.StringDtype(),\n",
"                        \"primary email\": pd.StringDtype(),\n",
"                        \"other emails\": pd.StringDtype(),\n",
"                        \"keywords\": pd.StringDtype(),\n",
"                        # was misspelled \"eternal identifiers\", so the dtype was silently ignored\n",
"                        \"external identifiers\": pd.StringDtype(),\n",
"                        \"education\": pd.StringDtype(),\n",
"                        \"employments\": pd.StringDtype(),\n",
"                        \"number of works\": pd.Int16Dtype(),\n",
"                        \"works source\": pd.StringDtype()})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000-0001-5000-2053</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Jorge</td>\n",
" <td>Jaramillo Sanchez</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000-0001-5000-6548</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Wiseman</td>\n",
" <td>Bekelesi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000-0001-5000-7962</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>ALICE</td>\n",
" <td>INDIMULI</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000-0001-5000-8586</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>shim</td>\n",
" <td>ji yun</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000-0001-5001-0256</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Sandro</td>\n",
" <td>Caramaschi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"0 0000-0001-5000-2053 True False False \n",
"1 0000-0001-5000-6548 True False False \n",
"2 0000-0001-5000-7962 True True True \n",
"3 0000-0001-5000-8586 True False False \n",
"4 0000-0001-5001-0256 True False False \n",
"\n",
" given names family name biography other names researcher urls \\\n",
"0 Jorge Jaramillo Sanchez <NA> <NA> <NA> \n",
"1 Wiseman Bekelesi <NA> <NA> <NA> \n",
"2 ALICE INDIMULI <NA> <NA> <NA> \n",
"3 shim ji yun <NA> <NA> <NA> \n",
"4 Sandro Caramaschi <NA> <NA> <NA> \n",
"\n",
" primary email other emails keywords external identifiers education \\\n",
"0 <NA> <NA> <NA> NaN <NA> \n",
"1 <NA> <NA> <NA> NaN <NA> \n",
"2 <NA> <NA> <NA> NaN <NA> \n",
"3 <NA> <NA> <NA> NaN <NA> \n",
"4 <NA> <NA> <NA> NaN <NA> \n",
"\n",
" employments number of works works source \n",
"0 <NA> 0 <NA> \n",
"1 <NA> 0 <NA> \n",
"2 <NA> 0 <NA> \n",
"3 <NA> 0 <NA> \n",
"4 <NA> 0 <NA> "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
" <td>[[\"Information engineering\", \"Ph.D.\", \"Univers...</td>\n",
" <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
" <td>37</td>\n",
" <td>[\"Scopus - Elsevier\", \"Crossref Metadata Searc...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... \n",
"\n",
" external identifiers \\\n",
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
"\n",
" education \\\n",
"8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n",
"\n",
" employments number of works \\\n",
"8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n",
"\n",
" works source \n",
"8840413 [\"Scopus - Elsevier\", \"Crossref Metadata Searc... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extracting works source"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Turn each serialised list literal into a real list; missing cells become [].\n",
"df['works source'] = df['works source'].fillna('[]').apply(ast.literal_eval)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def extract_work_source(lst):\n",
"    \"\"\"Keep only the work sources that mention Scopus - Elsevier or Crossref.\n",
"\n",
"    Matching is by substring, so e.g. 'Crossref Metadata Search' is kept too.\n",
"    Input order and duplicates are preserved.\n",
"    \"\"\"\n",
"    wanted = ('Scopus - Elsevier', 'Crossref')\n",
"    return [source for source in lst if any(tag in source for tag in wanted)]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Per-row list of the work sources retained by extract_work_source.\n",
"df['extracted_works_source'] = df['works source'].apply(extract_work_source)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# One 0/1 indicator column per retained work source.\n",
"# index=df.index makes concat(axis=1) pair rows by label; without it the new\n",
"# frame gets a fresh RangeIndex and rows misalign whenever df's index differs.\n",
"df = pd.concat(\n",
"    [df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']),\n",
"                      columns=mlb.classes_, index=df.index)],\n",
"    axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
" <td>[[\"Information engineering\", \"Ph.D.\", \"Univers...</td>\n",
" <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... \n",
"\n",
" external identifiers \\\n",
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
"\n",
" education \\\n",
"8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n",
"\n",
" employments number of works \\\n",
"8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n",
"\n",
" Crossref Crossref Metadata Search Scopus - Elsevier \n",
"8840413 1 1 1 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Education"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Turn each serialised list literal into a real list; missing cells become [].\n",
"df['education'] = df['education'].fillna('[]').apply(ast.literal_eval)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"df['n_education'] = df['education'].str.len()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df.drop('education', axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" <th>n_education</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
" <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... \n",
"\n",
" external identifiers \\\n",
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
"\n",
" employments number of works \\\n",
"8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n",
"\n",
" Crossref Crossref Metadata Search Scopus - Elsevier n_education \n",
"8840413 1 1 1 4 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Employment"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Turn each serialised list literal into a real list; missing cells become [].\n",
"df['employments'] = df['employments'].fillna('[]').apply(ast.literal_eval)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"df['n_employments'] = df['employments'].str.len()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"df.drop('employments', axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" <th>n_education</th>\n",
" <th>n_employments</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... \n",
"\n",
" external identifiers number of works Crossref \\\n",
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] 37 1 \n",
"\n",
" Crossref Metadata Search Scopus - Elsevier n_education \\\n",
"8840413 1 1 4 \n",
"\n",
" n_employments \n",
"8840413 5 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# External IDs"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Turn each serialised list literal into a real list; missing cells become [].\n",
"df['external identifiers'] = df['external identifiers'].fillna('[]').apply(ast.literal_eval)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# def extract_ids(lst):\n",
"# extracted = []\n",
"# for id in lst:\n",
"# extracted.append(id[0])\n",
"# return extracted"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"df['n_ext_ids'] = df['external identifiers'].str.len()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"df.drop(['external identifiers'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" <th>n_education</th>\n",
" <th>n_employments</th>\n",
" <th>n_ext_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords number of works \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... 37 \n",
"\n",
" Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n",
"8840413 1 1 1 4 \n",
"\n",
" n_employments n_ext_ids \n",
"8840413 5 1 "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extracting email domains"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Missing primary e-mail -> ''; missing other e-mails -> [] (parsed from the literal).\n",
"df['primary email'] = df['primary email'].fillna('')\n",
"df['other emails'] = df['other emails'].fillna('[]').apply(ast.literal_eval)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def extract_email_domains(row):\n",
"    \"\"\"Collect the domain part of every e-mail address found in a row.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    row : mapping\n",
"        Must provide 'primary email' (a string, '' when absent) and\n",
"        'other emails' (an iterable of strings).\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of str\n",
"        Domain of the primary e-mail (if any) followed by the domains of the\n",
"        other e-mails, in input order; duplicates are kept. Addresses without\n",
"        an '@' or with nothing after it contribute no domain.\n",
"    \"\"\"\n",
"    domains = []\n",
"    for address in [row['primary email'], *row['other emails']]:\n",
"        # Split on the LAST '@': a quoted local part may itself contain '@', and\n",
"        # an address without '@' must not abort the whole apply() with IndexError.\n",
"        _, at, domain = address.rpartition('@')\n",
"        if at and domain:\n",
"            domains.append(domain)\n",
"    return domains"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Row-wise: gather the domains of the primary and the other e-mail addresses.\n",
"df['email_domains'] = df[['primary email', 'other emails']].apply(extract_email_domains, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"34 [seh.ox.ac.uk, bsg.ox.ac.uk]\n",
"47 [foxmail.com]\n",
"103 [fvtm.bu.edu.eg]\n",
"297 [unipa.it]\n",
"299 [nhs.net]\n",
" ... \n",
"10746811 [gva.es, gmail.com]\n",
"10746850 [cinvestav.mx]\n",
"10746920 [gmail.com, hotmail.com]\n",
"10746975 [mail.ru]\n",
"10746988 [ucm.es]\n",
"Name: email_domains, Length: 141118, dtype: object"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['email_domains'].str.len() != 0]['email_domains']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One 0/1 indicator column per distinct e-mail domain.\n",
"# NOTE(review): mlb is created with default arguments, so fit_transform returns a\n",
"# dense array with one column per distinct domain -- presumably too large for the\n",
"# full dataset; confirm memory needs (or switch to a sparse encoding) before running.\n",
"# index=df.index makes concat(axis=1) pair rows by label rather than relying on\n",
"# df carrying a default RangeIndex.\n",
"df = pd.concat(\n",
"    [df, pd.DataFrame(mlb.fit_transform(df['email_domains']),\n",
"                      columns=mlb.classes_, index=df.index)],\n",
"    axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extracting URL domains"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Turn each serialised list literal into a real list; missing cells become [].\n",
"df['researcher urls'] = df['researcher urls'].fillna('[]').apply(ast.literal_eval)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def extract_url_domains(lst):\n",
"    \"\"\"Return the registered domain of every researcher URL in ``lst``.\n",
"\n",
"    Each element is indexed as (description, url); only the url (index 1) is\n",
"    handed to tldextract. Input order and duplicates are preserved.\n",
"    \"\"\"\n",
"    return [tldextract.extract(entry[1]).registered_domain for entry in lst]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# Per-row list of the registered domains of the researcher URLs.\n",
"df['url_domains'] = df['researcher urls'].apply(extract_url_domains)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5 [researchgate.net]\n",
"14 [tigerscaffolds.co.nz]\n",
"15 [corticalbrain.com]\n",
"29 [cnpq.br]\n",
"30 [sksahu.net]\n",
" ... \n",
"10746945 [telegra.ph]\n",
"10746950 [twitter.com, urbanfoodpolicy.com]\n",
"10746955 [openlearning.com]\n",
"10746984 [panaximco.vn]\n",
"10746987 [swansea.ac.uk]\n",
"Name: url_domains, Length: 688572, dtype: object"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['url_domains'].str.len() != 0]['url_domains']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One 0/1 indicator column per distinct URL domain.\n",
"# NOTE(review): mlb is created with default arguments, so fit_transform returns a\n",
"# dense array with one column per distinct domain -- presumably too large for the\n",
"# full dataset; confirm memory needs (or switch to a sparse encoding) before running.\n",
"# index=df.index makes concat(axis=1) pair rows by label rather than relying on\n",
"# df carrying a default RangeIndex.\n",
"df = pd.concat(\n",
"    [df, pd.DataFrame(mlb.fit_transform(df['url_domains']),\n",
"                      columns=mlb.classes_, index=df.index)],\n",
"    axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>keywords</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" <th>n_education</th>\n",
" <th>n_employments</th>\n",
" <th>n_ext_ids</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>[isti.cnr.it]</td>\n",
" <td>[github.io, twitter.com, linkedin.com]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" keywords number of works \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... 37 \n",
"\n",
" Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n",
"8840413 1 1 1 4 \n",
"\n",
" n_employments n_ext_ids email_domains \\\n",
"8840413 5 1 [isti.cnr.it] \n",
"\n",
" url_domains \n",
"8840413 [github.io, twitter.com, linkedin.com] "
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fixing keywords"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Turn each serialised list literal into a real list; missing cells become [].\n",
"df['keywords'] = df['keywords'].fillna('[]').apply(ast.literal_eval)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sometimes several keywords are entered as one comma-separated string (i.e. multiplexed into a single keyword entry). E.g."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9601705</th>\n",
" <td>0000-0002-8588-4196</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Pedro</td>\n",
" <td>Príncipe</td>\n",
" <td>Pedro Príncipe is an information, documentatio...</td>\n",
" <td>[\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"]</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[open access, open science, libraries, reposit...</td>\n",
" <td>[[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]]</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Librarian / Project manager\", \"Universidade...</td>\n",
" <td>5</td>\n",
" <td>[\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"9601705 0000-0002-8588-4196 True True True \n",
"\n",
" given names family name \\\n",
"9601705 Pedro Príncipe \n",
"\n",
" biography \\\n",
"9601705 Pedro Príncipe is an information, documentatio... \n",
"\n",
" other names researcher urls \\\n",
"9601705 [\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"] [] \n",
"\n",
" primary email other emails \\\n",
"9601705 [] \n",
"\n",
" keywords \\\n",
"9601705 [open access, open science, libraries, reposit... \n",
"\n",
" external identifiers education \\\n",
"9601705 [[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]] <NA> \n",
"\n",
" employments number of works \\\n",
"9601705 [[\"Librarian / Project manager\", \"Universidade... 5 \n",
"\n",
" works source email_domains \\\n",
"9601705 [\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"... [] \n",
"\n",
" url_domains \n",
"9601705 [] "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == PP]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def fix_keywords(lst):\n",
" fixed = []\n",
" for k in lst:\n",
" split = k.split(',')\n",
" fixed.extend(split)\n",
" return fixed"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['open access',\n",
" ' open science',\n",
" ' libraries',\n",
" ' repositories',\n",
" ' social web',\n",
" '']"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test = ['open access, open science, libraries, repositories, social web,']\n",
"fix_keywords(test)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"df['fixed_keywords'] = df['keywords'].apply(lambda lst: fix_keywords(lst))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" <th>fixed_keywords</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9517099</th>\n",
" <td>0000-0001-6997-9470</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>other</td>\n",
" <td>whatsapp</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[Otherwhatsapp, https://otherwhatsapp.com/], ...</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[otherwhatsapp.com, im-creator.com, facebook.c...</td>\n",
" <td>[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"9517099 0000-0001-6997-9470 True True True \n",
"\n",
" given names family name biography other names \\\n",
"9517099 other whatsapp <NA> <NA> \n",
"\n",
" researcher urls primary email \\\n",
"9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... \n",
"\n",
" other emails keywords \\\n",
"9517099 [] [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... \n",
"\n",
" external identifiers education employments number of works \\\n",
"9517099 NaN <NA> <NA> 0 \n",
"\n",
" works source email_domains \\\n",
"9517099 <NA> [] \n",
"\n",
" url_domains \\\n",
"9517099 [otherwhatsapp.com, im-creator.com, facebook.c... \n",
"\n",
" fixed_keywords \n",
"9517099 [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == WHATSAPP]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" <th>fixed_keywords</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000-0001-5000-2053</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Jorge</td>\n",
" <td>Jaramillo Sanchez</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000-0001-5000-6548</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Wiseman</td>\n",
" <td>Bekelesi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000-0001-5000-7962</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>ALICE</td>\n",
" <td>INDIMULI</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000-0001-5000-8586</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>shim</td>\n",
" <td>ji yun</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000-0001-5001-0256</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Sandro</td>\n",
" <td>Caramaschi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747035</th>\n",
" <td>0000-0003-4998-1551</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Animesh</td>\n",
" <td>Ghosh</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747036</th>\n",
" <td>0000-0003-4998-4111</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Hawa</td>\n",
" <td>Liberna</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747037</th>\n",
" <td>0000-0003-4998-6045</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Tongyi</td>\n",
" <td>Men</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747038</th>\n",
" <td>0000-0003-4998-8868</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Charldon</td>\n",
" <td>Wilken</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747039</th>\n",
" <td>0000-0003-4999-7916</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Tapas Bapu</td>\n",
" <td>B.R.</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10747040 rows × 19 columns</p>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email \\\n",
"0 0000-0001-5000-2053 True False \n",
"1 0000-0001-5000-6548 True False \n",
"2 0000-0001-5000-7962 True True \n",
"3 0000-0001-5000-8586 True False \n",
"4 0000-0001-5001-0256 True False \n",
"... ... ... ... \n",
"10747035 0000-0003-4998-1551 True False \n",
"10747036 0000-0003-4998-4111 True False \n",
"10747037 0000-0003-4998-6045 True False \n",
"10747038 0000-0003-4998-8868 True True \n",
"10747039 0000-0003-4999-7916 True True \n",
"\n",
" verified primary email given names family name biography \\\n",
"0 False Jorge Jaramillo Sanchez <NA> \n",
"1 False Wiseman Bekelesi <NA> \n",
"2 True ALICE INDIMULI <NA> \n",
"3 False shim ji yun <NA> \n",
"4 False Sandro Caramaschi <NA> \n",
"... ... ... ... ... \n",
"10747035 False Animesh Ghosh <NA> \n",
"10747036 False Hawa Liberna <NA> \n",
"10747037 False Tongyi Men <NA> \n",
"10747038 False Charldon Wilken <NA> \n",
"10747039 True Tapas Bapu B.R. <NA> \n",
"\n",
" other names researcher urls primary email other emails \\\n",
"0 <NA> [] [] \n",
"1 <NA> [] [] \n",
"2 <NA> [] [] \n",
"3 <NA> [] [] \n",
"4 <NA> [] [] \n",
"... ... ... ... ... \n",
"10747035 <NA> [] [] \n",
"10747036 <NA> [] [] \n",
"10747037 <NA> [] [] \n",
"10747038 <NA> [] [] \n",
"10747039 <NA> [] [] \n",
"\n",
" external identifiers education employments number of works \\\n",
"0 NaN <NA> <NA> 0 \n",
"1 NaN <NA> <NA> 0 \n",
"2 NaN <NA> <NA> 0 \n",
"3 NaN <NA> <NA> 0 \n",
"4 NaN <NA> <NA> 0 \n",
"... ... ... ... ... \n",
"10747035 NaN <NA> <NA> 0 \n",
"10747036 NaN <NA> <NA> 0 \n",
"10747037 NaN <NA> <NA> 0 \n",
"10747038 NaN <NA> <NA> 0 \n",
"10747039 NaN <NA> <NA> 0 \n",
"\n",
" works source email_domains url_domains fixed_keywords \n",
"0 <NA> [] [] [] \n",
"1 <NA> [] [] [] \n",
"2 <NA> [] [] [] \n",
"3 <NA> [] [] [] \n",
"4 <NA> [] [] [] \n",
"... ... ... ... ... \n",
"10747035 <NA> [] [] [] \n",
"10747036 <NA> [] [] [] \n",
"10747037 <NA> [] [] [] \n",
"10747038 <NA> [] [] [] \n",
"10747039 <NA> [] [] [] \n",
"\n",
"[10747040 rows x 19 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop('keywords', axis=1, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fixes for other columns with lists inside"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))\n",
"# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
"# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
"# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
"# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
"# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
"# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
"# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature extraction"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# df['email_encoding'] = mlb.fit_transform(df['email_domains'])\n",
"# df['url_encoding'] = mlb.fit_transform(df['url_domains'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" <th>fixed_keywords</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000-0001-5000-2053</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Jorge</td>\n",
" <td>Jaramillo Sanchez</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000-0001-5000-6548</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Wiseman</td>\n",
" <td>Bekelesi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000-0001-5000-7962</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>ALICE</td>\n",
" <td>INDIMULI</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000-0001-5000-8586</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>shim</td>\n",
" <td>ji yun</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000-0001-5001-0256</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Sandro</td>\n",
" <td>Caramaschi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747035</th>\n",
" <td>0000-0003-4998-1551</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Animesh</td>\n",
" <td>Ghosh</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747036</th>\n",
" <td>0000-0003-4998-4111</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Hawa</td>\n",
" <td>Liberna</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747037</th>\n",
" <td>0000-0003-4998-6045</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Tongyi</td>\n",
" <td>Men</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747038</th>\n",
" <td>0000-0003-4998-8868</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Charldon</td>\n",
" <td>Wilken</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747039</th>\n",
" <td>0000-0003-4999-7916</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Tapas Bapu</td>\n",
" <td>B.R.</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10747040 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email \\\n",
"0 0000-0001-5000-2053 True False \n",
"1 0000-0001-5000-6548 True False \n",
"2 0000-0001-5000-7962 True True \n",
"3 0000-0001-5000-8586 True False \n",
"4 0000-0001-5001-0256 True False \n",
"... ... ... ... \n",
"10747035 0000-0003-4998-1551 True False \n",
"10747036 0000-0003-4998-4111 True False \n",
"10747037 0000-0003-4998-6045 True False \n",
"10747038 0000-0003-4998-8868 True True \n",
"10747039 0000-0003-4999-7916 True True \n",
"\n",
" verified primary email given names family name biography \\\n",
"0 False Jorge Jaramillo Sanchez <NA> \n",
"1 False Wiseman Bekelesi <NA> \n",
"2 True ALICE INDIMULI <NA> \n",
"3 False shim ji yun <NA> \n",
"4 False Sandro Caramaschi <NA> \n",
"... ... ... ... ... \n",
"10747035 False Animesh Ghosh <NA> \n",
"10747036 False Hawa Liberna <NA> \n",
"10747037 False Tongyi Men <NA> \n",
"10747038 False Charldon Wilken <NA> \n",
"10747039 True Tapas Bapu B.R. <NA> \n",
"\n",
" other names researcher urls primary email other emails keywords \\\n",
"0 <NA> [] [] [] \n",
"1 <NA> [] [] [] \n",
"2 <NA> [] [] [] \n",
"3 <NA> [] [] [] \n",
"4 <NA> [] [] [] \n",
"... ... ... ... ... ... \n",
"10747035 <NA> [] [] [] \n",
"10747036 <NA> [] [] [] \n",
"10747037 <NA> [] [] [] \n",
"10747038 <NA> [] [] [] \n",
"10747039 <NA> [] [] [] \n",
"\n",
" external identifiers education employments number of works \\\n",
"0 [] [] [] 0 \n",
"1 [] [] [] 0 \n",
"2 [] [] [] 0 \n",
"3 [] [] [] 0 \n",
"4 [] [] [] 0 \n",
"... ... ... ... ... \n",
"10747035 [] [] [] 0 \n",
"10747036 [] [] [] 0 \n",
"10747037 [] [] [] 0 \n",
"10747038 [] [] [] 0 \n",
"10747039 [] [] [] 0 \n",
"\n",
" works source email_domains url_domains fixed_keywords \n",
"0 [] [] [] [] \n",
"1 [] [] [] [] \n",
"2 [] [] [] [] \n",
"3 [] [] [] [] \n",
"4 [] [] [] [] \n",
"... ... ... ... ... \n",
"10747035 [] [] [] [] \n",
"10747036 [] [] [] [] \n",
"10747037 [] [] [] [] \n",
"10747038 [] [] [] [] \n",
"10747039 [] [] [] [] \n",
"\n",
"[10747040 rows x 20 columns]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}