fake-orcid-analysis/notebooks/03-Feature extraction.ipynb

2423 lines
82 KiB
Plaintext
Raw Normal View History

2021-03-18 17:43:00 +01:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO** in the data:\n",
"- Column names: no spaces\n",
"- If a list is empty, serialise `[]` in the CSV\n",
"- If a string is empty, serialise `''` in the CSV"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"from urllib.parse import urlparse\n",
"import tldextract\n",
"\n",
"import pandas as pd\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"mlb = MultiLabelBinarizer()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Notable Solid ORCID iDs for debug purposes\n",
"AM = '0000-0002-5193-7851'\n",
"PP = '0000-0002-8588-4196'\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Notable fake ORCID iDs for debug purposes\n",
"SCAFFOLD = '0000-0001-5004-7761'\n",
"WHATSAPP = '0000-0001-6997-9470'\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Load the raw ORCID dump. The dtype keys must match the TSV header verbatim:\n",
"# a key that matches no column is silently ignored and that column falls back\n",
"# to type inference. 'verifyed email' is the spelling used by the header itself.\n",
"df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n",
"                 dtype={\"orcid\": pd.StringDtype(),\n",
"                        \"claimed\": bool,\n",
"                        \"verifyed email\": bool,\n",
"                        \"verified primary email\": bool,\n",
"                        \"given names\": pd.StringDtype(),\n",
"                        \"family name\": pd.StringDtype(),\n",
"                        \"biography\": pd.StringDtype(),\n",
"                        \"other names\": pd.StringDtype(),\n",
"                        \"researcher urls\": pd.StringDtype(),\n",
"                        \"primary email\": pd.StringDtype(),\n",
"                        \"other emails\": pd.StringDtype(),\n",
"                        \"keywords\": pd.StringDtype(),\n",
"                        \"external identifiers\": pd.StringDtype(),\n",
"                        \"education\": pd.StringDtype(),\n",
"                        \"employments\": pd.StringDtype(),\n",
"                        \"number of works\": pd.Int16Dtype(),\n",
"                        \"works source\": pd.StringDtype()})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000-0001-5000-2053</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Jorge</td>\n",
" <td>Jaramillo Sanchez</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000-0001-5000-6548</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Wiseman</td>\n",
" <td>Bekelesi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000-0001-5000-7962</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>ALICE</td>\n",
" <td>INDIMULI</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000-0001-5000-8586</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>shim</td>\n",
" <td>ji yun</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000-0001-5001-0256</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Sandro</td>\n",
" <td>Caramaschi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"0 0000-0001-5000-2053 True False False \n",
"1 0000-0001-5000-6548 True False False \n",
"2 0000-0001-5000-7962 True True True \n",
"3 0000-0001-5000-8586 True False False \n",
"4 0000-0001-5001-0256 True False False \n",
"\n",
" given names family name biography other names researcher urls \\\n",
"0 Jorge Jaramillo Sanchez <NA> <NA> <NA> \n",
"1 Wiseman Bekelesi <NA> <NA> <NA> \n",
"2 ALICE INDIMULI <NA> <NA> <NA> \n",
"3 shim ji yun <NA> <NA> <NA> \n",
"4 Sandro Caramaschi <NA> <NA> <NA> \n",
"\n",
" primary email other emails keywords external identifiers education \\\n",
"0 <NA> <NA> <NA> NaN <NA> \n",
"1 <NA> <NA> <NA> NaN <NA> \n",
"2 <NA> <NA> <NA> NaN <NA> \n",
"3 <NA> <NA> <NA> NaN <NA> \n",
"4 <NA> <NA> <NA> NaN <NA> \n",
"\n",
" employments number of works works source \n",
"0 <NA> 0 <NA> \n",
"1 <NA> 0 <NA> \n",
"2 <NA> 0 <NA> \n",
"3 <NA> 0 <NA> \n",
"4 <NA> 0 <NA> "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
" <td>[[\"Information engineering\", \"Ph.D.\", \"Univers...</td>\n",
" <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
" <td>37</td>\n",
" <td>[\"Scopus - Elsevier\", \"Crossref Metadata Searc...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... \n",
"\n",
" external identifiers \\\n",
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
"\n",
" education \\\n",
"8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n",
"\n",
" employments number of works \\\n",
"8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n",
"\n",
" works source \n",
"8840413 [\"Scopus - Elsevier\", \"Crossref Metadata Searc... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extracting works source"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def extract_work_source(lst):\n",
"    \"\"\"Return the entries of ``lst`` that mention Scopus or Crossref.\n",
"\n",
"    An entry is kept when it contains the substring 'Scopus - Elsevier' or\n",
"    'Crossref'; the relative order of the kept entries is preserved.\n",
"    \"\"\"\n",
"    wanted = ('Scopus - Elsevier', 'Crossref')\n",
"    return [source for source in lst if any(marker in source for marker in wanted)]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df['extracted_works_source'] = df['works source'].apply(lambda x: extract_work_source(x))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']), columns=mlb.classes_)], axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
" <td>[[\"Information engineering\", \"Ph.D.\", \"Univers...</td>\n",
" <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... \n",
"\n",
" external identifiers \\\n",
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
"\n",
" education \\\n",
"8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n",
"\n",
" employments number of works \\\n",
"8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n",
"\n",
" Crossref Crossref Metadata Search Scopus - Elsevier \n",
"8840413 1 1 1 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Education"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"df['n_education'] = df['education'].str.len()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df.drop('education', axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" <th>n_education</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
" <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... \n",
"\n",
" external identifiers \\\n",
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
"\n",
" employments number of works \\\n",
"8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n",
"\n",
" Crossref Crossref Metadata Search Scopus - Elsevier n_education \n",
"8840413 1 1 1 4 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Employment"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"df['n_employments'] = df['employments'].str.len()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"df.drop('employments', axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" <th>n_education</th>\n",
" <th>n_employments</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... \n",
"\n",
" external identifiers number of works Crossref \\\n",
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] 37 1 \n",
"\n",
" Crossref Metadata Search Scopus - Elsevier n_education \\\n",
"8840413 1 1 4 \n",
"\n",
" n_employments \n",
"8840413 5 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# External IDs"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# def extract_ids(lst):\n",
"# extracted = []\n",
"# for id in lst:\n",
"# extracted.append(id[0])\n",
"# return extracted"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"df['n_ext_ids'] = df['external identifiers'].str.len()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"df.drop(['external identifiers'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" <th>n_education</th>\n",
" <th>n_employments</th>\n",
" <th>n_ext_ids</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" researcher urls \\\n",
"8840413 [[\"Personal website\", \"https://andremann.githu... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it <NA> \n",
"\n",
" keywords number of works \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... 37 \n",
"\n",
" Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n",
"8840413 1 1 1 4 \n",
"\n",
" n_employments n_ext_ids \n",
"8840413 5 1 "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extracting email domains"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"df['primary email'] = df['primary email'].fillna('')\n",
"df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def extract_email_domains(row):\n",
"    \"\"\"Collect the domain part of every e-mail address found in ``row``.\n",
"\n",
"    ``row['primary email']`` is a single address ('' when absent) and\n",
"    ``row['other emails']`` is an iterable of addresses. The primary domain,\n",
"    if any, comes first, followed by the others in their original order.\n",
"\n",
"    The domain is whatever follows the last '@'. Entries without an '@', or\n",
"    with nothing after it, are skipped instead of raising IndexError.\n",
"    \"\"\"\n",
"    domains = []\n",
"    for email in [row['primary email'], *row['other emails']]:\n",
"        # rpartition never raises: `at` is '' when the address has no '@'\n",
"        _, at, domain = email.rpartition('@')\n",
"        if at and domain:\n",
"            domains.append(domain)\n",
"    return domains"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"34 [seh.ox.ac.uk, bsg.ox.ac.uk]\n",
"47 [foxmail.com]\n",
"103 [fvtm.bu.edu.eg]\n",
"297 [unipa.it]\n",
"299 [nhs.net]\n",
" ... \n",
"10746811 [gva.es, gmail.com]\n",
"10746850 [cinvestav.mx]\n",
"10746920 [gmail.com, hotmail.com]\n",
"10746975 [mail.ru]\n",
"10746988 [ucm.es]\n",
"Name: email_domains, Length: 141118, dtype: object"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['email_domains'].str.len() != 0]['email_domains']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extracting URL domains"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def extract_url_domains(lst):\n",
"    \"\"\"Map each researcher-URL entry in ``lst`` to its registered domain.\n",
"\n",
"    Every entry is indexed as ``entry[1]`` to obtain the URL (``entry[0]`` is a\n",
"    string describing it); the URL is passed to ``tldextract.extract`` and the\n",
"    ``registered_domain`` of the result is collected, one per entry, in order.\n",
"    \"\"\"\n",
"    return [tldextract.extract(entry[1]).registered_domain for entry in lst]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5 [researchgate.net]\n",
"14 [tigerscaffolds.co.nz]\n",
"15 [corticalbrain.com]\n",
"29 [cnpq.br]\n",
"30 [sksahu.net]\n",
" ... \n",
"10746945 [telegra.ph]\n",
"10746950 [twitter.com, urbanfoodpolicy.com]\n",
"10746955 [openlearning.com]\n",
"10746984 [panaximco.vn]\n",
"10746987 [swansea.ac.uk]\n",
"Name: url_domains, Length: 688572, dtype: object"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['url_domains'].str.len() != 0]['url_domains']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>keywords</th>\n",
" <th>number of works</th>\n",
" <th>Crossref</th>\n",
" <th>Crossref Metadata Search</th>\n",
" <th>Scopus - Elsevier</th>\n",
" <th>n_education</th>\n",
" <th>n_employments</th>\n",
" <th>n_ext_ids</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>[isti.cnr.it]</td>\n",
" <td>[github.io, twitter.com, linkedin.com]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci <NA> <NA> \n",
"\n",
" keywords number of works \\\n",
"8840413 [\"Data science \", \"science of science\", \"schol... 37 \n",
"\n",
" Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n",
"8840413 1 1 1 4 \n",
"\n",
" n_employments n_ext_ids email_domains \\\n",
"8840413 5 1 [isti.cnr.it] \n",
"\n",
" url_domains \n",
"8840413 [github.io, twitter.com, linkedin.com] "
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fixing keywords"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sometimes several keywords are entered as a single comma-separated string (i.e. multiplexed into one keyword entry), e.g.:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9601705</th>\n",
" <td>0000-0002-8588-4196</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Pedro</td>\n",
" <td>Príncipe</td>\n",
" <td>Pedro Príncipe is an information, documentatio...</td>\n",
" <td>[\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"]</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[open access, open science, libraries, reposit...</td>\n",
" <td>[[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]]</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[\"Librarian / Project manager\", \"Universidade...</td>\n",
" <td>5</td>\n",
" <td>[\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"9601705 0000-0002-8588-4196 True True True \n",
"\n",
" given names family name \\\n",
"9601705 Pedro Príncipe \n",
"\n",
" biography \\\n",
"9601705 Pedro Príncipe is an information, documentatio... \n",
"\n",
" other names researcher urls \\\n",
"9601705 [\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"] [] \n",
"\n",
" primary email other emails \\\n",
"9601705 [] \n",
"\n",
" keywords \\\n",
"9601705 [open access, open science, libraries, reposit... \n",
"\n",
" external identifiers education \\\n",
"9601705 [[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]] <NA> \n",
"\n",
" employments number of works \\\n",
"9601705 [[\"Librarian / Project manager\", \"Universidade... 5 \n",
"\n",
" works source email_domains \\\n",
"9601705 [\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"... [] \n",
"\n",
" url_domains \n",
"9601705 [] "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == PP]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def fix_keywords(lst):\n",
" fixed = []\n",
" for k in lst:\n",
" split = k.split(',')\n",
" fixed.extend(split)\n",
" return fixed"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['open access',\n",
" ' open science',\n",
" ' libraries',\n",
" ' repositories',\n",
" ' social web',\n",
" '']"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test = ['open access, open science, libraries, repositories, social web,']\n",
"fix_keywords(test)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"df['fixed_keywords'] = df['keywords'].apply(lambda lst: fix_keywords(lst))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" <th>fixed_keywords</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9517099</th>\n",
" <td>0000-0001-6997-9470</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>other</td>\n",
" <td>whatsapp</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[[Otherwhatsapp, https://otherwhatsapp.com/], ...</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[otherwhatsapp.com, im-creator.com, facebook.c...</td>\n",
" <td>[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"9517099 0000-0001-6997-9470 True True True \n",
"\n",
" given names family name biography other names \\\n",
"9517099 other whatsapp <NA> <NA> \n",
"\n",
" researcher urls primary email \\\n",
"9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... \n",
"\n",
" other emails keywords \\\n",
"9517099 [] [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... \n",
"\n",
" external identifiers education employments number of works \\\n",
"9517099 NaN <NA> <NA> 0 \n",
"\n",
" works source email_domains \\\n",
"9517099 <NA> [] \n",
"\n",
" url_domains \\\n",
"9517099 [otherwhatsapp.com, im-creator.com, facebook.c... \n",
"\n",
" fixed_keywords \n",
"9517099 [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == WHATSAPP]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" <th>fixed_keywords</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000-0001-5000-2053</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Jorge</td>\n",
" <td>Jaramillo Sanchez</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000-0001-5000-6548</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Wiseman</td>\n",
" <td>Bekelesi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000-0001-5000-7962</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>ALICE</td>\n",
" <td>INDIMULI</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000-0001-5000-8586</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>shim</td>\n",
" <td>ji yun</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000-0001-5001-0256</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Sandro</td>\n",
" <td>Caramaschi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747035</th>\n",
" <td>0000-0003-4998-1551</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Animesh</td>\n",
" <td>Ghosh</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747036</th>\n",
" <td>0000-0003-4998-4111</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Hawa</td>\n",
" <td>Liberna</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747037</th>\n",
" <td>0000-0003-4998-6045</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Tongyi</td>\n",
" <td>Men</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747038</th>\n",
" <td>0000-0003-4998-8868</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Charldon</td>\n",
" <td>Wilken</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747039</th>\n",
" <td>0000-0003-4999-7916</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Tapas Bapu</td>\n",
" <td>B.R.</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10747040 rows × 19 columns</p>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email \\\n",
"0 0000-0001-5000-2053 True False \n",
"1 0000-0001-5000-6548 True False \n",
"2 0000-0001-5000-7962 True True \n",
"3 0000-0001-5000-8586 True False \n",
"4 0000-0001-5001-0256 True False \n",
"... ... ... ... \n",
"10747035 0000-0003-4998-1551 True False \n",
"10747036 0000-0003-4998-4111 True False \n",
"10747037 0000-0003-4998-6045 True False \n",
"10747038 0000-0003-4998-8868 True True \n",
"10747039 0000-0003-4999-7916 True True \n",
"\n",
" verified primary email given names family name biography \\\n",
"0 False Jorge Jaramillo Sanchez <NA> \n",
"1 False Wiseman Bekelesi <NA> \n",
"2 True ALICE INDIMULI <NA> \n",
"3 False shim ji yun <NA> \n",
"4 False Sandro Caramaschi <NA> \n",
"... ... ... ... ... \n",
"10747035 False Animesh Ghosh <NA> \n",
"10747036 False Hawa Liberna <NA> \n",
"10747037 False Tongyi Men <NA> \n",
"10747038 False Charldon Wilken <NA> \n",
"10747039 True Tapas Bapu B.R. <NA> \n",
"\n",
" other names researcher urls primary email other emails \\\n",
"0 <NA> [] [] \n",
"1 <NA> [] [] \n",
"2 <NA> [] [] \n",
"3 <NA> [] [] \n",
"4 <NA> [] [] \n",
"... ... ... ... ... \n",
"10747035 <NA> [] [] \n",
"10747036 <NA> [] [] \n",
"10747037 <NA> [] [] \n",
"10747038 <NA> [] [] \n",
"10747039 <NA> [] [] \n",
"\n",
" external identifiers education employments number of works \\\n",
"0 NaN <NA> <NA> 0 \n",
"1 NaN <NA> <NA> 0 \n",
"2 NaN <NA> <NA> 0 \n",
"3 NaN <NA> <NA> 0 \n",
"4 NaN <NA> <NA> 0 \n",
"... ... ... ... ... \n",
"10747035 NaN <NA> <NA> 0 \n",
"10747036 NaN <NA> <NA> 0 \n",
"10747037 NaN <NA> <NA> 0 \n",
"10747038 NaN <NA> <NA> 0 \n",
"10747039 NaN <NA> <NA> 0 \n",
"\n",
" works source email_domains url_domains fixed_keywords \n",
"0 <NA> [] [] [] \n",
"1 <NA> [] [] [] \n",
"2 <NA> [] [] [] \n",
"3 <NA> [] [] [] \n",
"4 <NA> [] [] [] \n",
"... ... ... ... ... \n",
"10747035 <NA> [] [] [] \n",
"10747036 <NA> [] [] [] \n",
"10747037 <NA> [] [] [] \n",
"10747038 <NA> [] [] [] \n",
"10747039 <NA> [] [] [] \n",
"\n",
"[10747040 rows x 19 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop('keywords', axis=1, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fixes for other columns with lists inside"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
    "# NOTE(review): this list-parsing step is disabled (every line is commented out).\n",
    "# 'other names' is the only column parsed without .fillna('[]'), yet the rows\n",
    "# displayed above show <NA> in that column — presumably literal_eval would fail\n",
    "# on those; confirm and add the fillna before re-enabling.\n",
    "# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))\n",
    "# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature extraction"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
    "# NOTE(review): disabled. Before re-enabling, check against the scikit-learn docs:\n",
    "# MultiLabelBinarizer.fit_transform presumably returns a 2-D indicator matrix (one\n",
    "# column per class), which would not fit into a single DataFrame column, and the\n",
    "# second fit_transform on the shared `mlb` would replace the classes learned by\n",
    "# the first — TODO verify; a separate binarizer per column may be needed.\n",
    "# df['email_encoding'] = mlb.fit_transform(df['email_domains'])\n",
    "# df['url_encoding'] = mlb.fit_transform(df['url_domains'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>email_domains</th>\n",
" <th>url_domains</th>\n",
" <th>fixed_keywords</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000-0001-5000-2053</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Jorge</td>\n",
" <td>Jaramillo Sanchez</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000-0001-5000-6548</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Wiseman</td>\n",
" <td>Bekelesi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000-0001-5000-7962</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>ALICE</td>\n",
" <td>INDIMULI</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000-0001-5000-8586</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>shim</td>\n",
" <td>ji yun</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000-0001-5001-0256</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Sandro</td>\n",
" <td>Caramaschi</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747035</th>\n",
" <td>0000-0003-4998-1551</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Animesh</td>\n",
" <td>Ghosh</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747036</th>\n",
" <td>0000-0003-4998-4111</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Hawa</td>\n",
" <td>Liberna</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747037</th>\n",
" <td>0000-0003-4998-6045</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>Tongyi</td>\n",
" <td>Men</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747038</th>\n",
" <td>0000-0003-4998-8868</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>Charldon</td>\n",
" <td>Wilken</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747039</th>\n",
" <td>0000-0003-4999-7916</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Tapas Bapu</td>\n",
" <td>B.R.</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10747040 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email \\\n",
"0 0000-0001-5000-2053 True False \n",
"1 0000-0001-5000-6548 True False \n",
"2 0000-0001-5000-7962 True True \n",
"3 0000-0001-5000-8586 True False \n",
"4 0000-0001-5001-0256 True False \n",
"... ... ... ... \n",
"10747035 0000-0003-4998-1551 True False \n",
"10747036 0000-0003-4998-4111 True False \n",
"10747037 0000-0003-4998-6045 True False \n",
"10747038 0000-0003-4998-8868 True True \n",
"10747039 0000-0003-4999-7916 True True \n",
"\n",
" verified primary email given names family name biography \\\n",
"0 False Jorge Jaramillo Sanchez <NA> \n",
"1 False Wiseman Bekelesi <NA> \n",
"2 True ALICE INDIMULI <NA> \n",
"3 False shim ji yun <NA> \n",
"4 False Sandro Caramaschi <NA> \n",
"... ... ... ... ... \n",
"10747035 False Animesh Ghosh <NA> \n",
"10747036 False Hawa Liberna <NA> \n",
"10747037 False Tongyi Men <NA> \n",
"10747038 False Charldon Wilken <NA> \n",
"10747039 True Tapas Bapu B.R. <NA> \n",
"\n",
" other names researcher urls primary email other emails keywords \\\n",
"0 <NA> [] [] [] \n",
"1 <NA> [] [] [] \n",
"2 <NA> [] [] [] \n",
"3 <NA> [] [] [] \n",
"4 <NA> [] [] [] \n",
"... ... ... ... ... ... \n",
"10747035 <NA> [] [] [] \n",
"10747036 <NA> [] [] [] \n",
"10747037 <NA> [] [] [] \n",
"10747038 <NA> [] [] [] \n",
"10747039 <NA> [] [] [] \n",
"\n",
" external identifiers education employments number of works \\\n",
"0 [] [] [] 0 \n",
"1 [] [] [] 0 \n",
"2 [] [] [] 0 \n",
"3 [] [] [] 0 \n",
"4 [] [] [] 0 \n",
"... ... ... ... ... \n",
"10747035 [] [] [] 0 \n",
"10747036 [] [] [] 0 \n",
"10747037 [] [] [] 0 \n",
"10747038 [] [] [] 0 \n",
"10747039 [] [] [] 0 \n",
"\n",
" works source email_domains url_domains fixed_keywords \n",
"0 [] [] [] [] \n",
"1 [] [] [] [] \n",
"2 [] [] [] [] \n",
"3 [] [] [] [] \n",
"4 [] [] [] [] \n",
"... ... ... ... ... \n",
"10747035 [] [] [] [] \n",
"10747036 [] [] [] [] \n",
"10747037 [] [] [] [] \n",
"10747038 [] [] [] [] \n",
"10747039 [] [] [] [] \n",
"\n",
"[10747040 rows x 20 columns]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}