82 KiB
82 KiB
TODO in the data preparation:
- Column names -> no spaces
- If a list is empty, serialise [] in the CSV
- If a string is empty, serialise '' in the CSV
In [1]:
import ast
from urllib.parse import urlparse
import tldextract
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
In [2]:
mlb = MultiLabelBinarizer()
In [3]:
# Notable Solid ORCID iDs for debug purposes (used to spot-check transforms).
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
In [4]:
# Notable fake ORCID iDs for debug purposes.
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
In [5]:
# Load the raw ORCID dump. List-valued columns arrive as their string
# representation and are parsed with ast.literal_eval in later cells.
df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header=0,
                 dtype={"orcid": pd.StringDtype(),
                        "claimed": bool,
                        # NOTE(review): presumably the literal (misspelled)
                        # column name in the TSV — confirm against the file.
                        "verifyed email": bool,
                        "verified primary email": bool,
                        "given names": pd.StringDtype(),
                        "family name": pd.StringDtype(),
                        "biography": pd.StringDtype(),
                        "other names": pd.StringDtype(),
                        "researcher urls": pd.StringDtype(),
                        "primary email": pd.StringDtype(),
                        "other emails": pd.StringDtype(),
                        "keywords": pd.StringDtype(),
                        # Fixed key: the column is referenced as
                        # "external identifiers" later in this notebook; the old
                        # "eternal identifiers" key matched no column and was
                        # silently ignored by read_csv.
                        "external identifiers": pd.StringDtype(),
                        "education": pd.StringDtype(),
                        "employments": pd.StringDtype(),
                        "number of works": pd.Int16Dtype(),
                        "works source": pd.StringDtype()})
In [6]:
df.head(5)
Out[6]:
In [7]:
df[df['orcid'] == AM]
Out[7]:
Extracting works source¶
In [8]:
df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [9]:
def extract_work_source(lst):
    """Return only the work sources mentioning Scopus/Elsevier or Crossref."""
    wanted = ('Scopus - Elsevier', 'Crossref')
    return [source for source in lst if any(tag in source for tag in wanted)]
In [10]:
df['extracted_works_source'] = df['works source'].apply(lambda x: extract_work_source(x))
In [11]:
df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']), columns=mlb.classes_)], axis = 1)
In [12]:
df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)
In [13]:
df[df['orcid'] == AM]
Out[13]:
Education¶
In [14]:
df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [15]:
df['n_education'] = df['education'].str.len()
In [16]:
df.drop('education', axis=1, inplace=True)
In [17]:
df[df['orcid'] == AM]
Out[17]:
Employment¶
In [18]:
df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [19]:
df['n_employments'] = df['employments'].str.len()
In [20]:
df.drop('employments', axis=1, inplace=True)
In [21]:
df[df['orcid'] == AM]
Out[21]:
External IDs¶
In [22]:
df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [23]:
# NOTE(review): dead alternative kept for reference — external identifiers are
# ultimately summarised as a count (n_ext_ids) rather than one-hot encoded;
# consider deleting this cell.
# def extract_ids(lst):
#     extracted = []
#     for id in lst:
#         extracted.append(id[0])
#     return extracted
In [24]:
# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))
In [25]:
# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)
In [26]:
df['n_ext_ids'] = df['external identifiers'].str.len()
In [27]:
df.drop(['external identifiers'], axis=1, inplace=True)
In [28]:
df[df['orcid'] == AM]
Out[28]:
Extracting email domains¶
In [29]:
# Normalise missing e-mail fields: empty string / empty list.
df['primary email'] = df['primary email'].fillna('')
df['other emails'] = df['other emails'].fillna('[]').map(ast.literal_eval)
In [30]:
def extract_email_domains(row):
    """Collect the domain part of a row's primary and other e-mail addresses.

    Expects row['primary email'] to be a string ('' when missing) and
    row['other emails'] to be a list of strings. Addresses without an '@'
    are skipped instead of raising IndexError as the previous split('@')[1]
    version did.
    """
    candidates = []
    if row['primary email']:
        candidates.append(row['primary email'])
    candidates.extend(row['other emails'])

    domains = []
    for email in candidates:
        # rpartition takes the text after the *last* '@', which is the actual
        # domain even for unusual addresses containing several '@'.
        _, sep, domain = email.rpartition('@')
        if sep:
            domains.append(domain)
    return domains
In [31]:
df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)
In [32]:
df[df['email_domains'].str.len() != 0]['email_domains']
Out[32]:
In [ ]:
df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)
In [ ]:
df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)
In [ ]:
df[df['orcid'] == AM]
Extracting URL domains¶
In [35]:
df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [36]:
def extract_url_domains(lst):
    """Return the registered domain of every (description, url) pair in lst."""
    # Each entry is a pair: index 0 describes the URL, index 1 is the URL itself.
    return [tldextract.extract(pair[1]).registered_domain for pair in lst]
In [37]:
df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))
In [38]:
df[df['url_domains'].str.len() != 0]['url_domains']
Out[38]:
In [ ]:
df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)
In [39]:
df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)
In [40]:
df[df['orcid'] == AM]
Out[40]:
Fixing keywords¶
In [18]:
df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))
Sometimes several distinct keywords are multiplexed into a single comma-separated string. E.g.
In [19]:
df[df['orcid'] == PP]
Out[19]:
In [20]:
def fix_keywords(lst):
    """De-multiplex comma-separated keyword strings into single keywords.

    Also strips surrounding whitespace from each keyword and drops empty
    fragments (such as the one produced by a trailing comma), which the
    plain split(',') kept and which would pollute the one-hot encoding.
    """
    fixed = []
    for keyword in lst:
        for part in keyword.split(','):
            part = part.strip()
            if part:
                fixed.append(part)
    return fixed
In [21]:
# Quick inline check of fix_keywords on a sample multiplexed keyword string.
test = ['open access, open science, libraries, repositories, social web,']
fix_keywords(test)
Out[21]:
In [22]:
df['fixed_keywords'] = df['keywords'].apply(lambda lst: fix_keywords(lst))
In [23]:
df[df['orcid'] == WHATSAPP]
Out[23]:
In [24]:
df.drop('keywords', axis=1, inplace=True)
Out[24]:
Fixes for other columns with lists inside¶
In [27]:
# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))
# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))
Feature extraction¶
In [28]:
# df['email_encoding'] = mlb.fit_transform(df['email_domains'])
# df['url_encoding'] = mlb.fit_transform(df['url_domains'])
In [29]:
df
Out[29]: