Todo in data

Column names -> no space
If a list is empty, serialise [] in the csv
If a string is empty, serialise '' in the csv

In [1]:

import ast
from urllib.parse import urlparse
import tldextract

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:

mlb = MultiLabelBinarizer()

In [3]:

# Notable Solid ORCID iDs for debug purposes
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'

In [4]:

# Notable fake ORCID iDs for debug purposes
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'

In [5]:

df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header = 0,
                         dtype = {"orcid": pd.StringDtype(), 
                                  "claimed": bool, 
                                  "verifyed email": bool, 
                                  "verified primary email": bool,
                                  "given names": pd.StringDtype(),
                                  "family name": pd.StringDtype(),
                                  "biography": pd.StringDtype(),
                                  "other names": pd.StringDtype(),
                                  "researcher urls": pd.StringDtype(),
                                  "primary email": pd.StringDtype(),
                                  "other emails": pd.StringDtype(),
                                  "keywords": pd.StringDtype(),
                                  "eternal identifiers": pd.StringDtype(),
                                  "education": pd.StringDtype(),
                                  "employments": pd.StringDtype(),
                                  "number of works": pd.Int16Dtype(),
                                  "works source": pd.StringDtype()})

In [6]:

df.head(5)

Out[6]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	keywords	external identifiers	education	employments	works source
0	0000-0001-5000-2053	True	False	False	Jorge	Jaramillo Sanchez	<NA>	<NA>	<NA>	<NA>	<NA>	<NA>	NaN	<NA>	<NA>	<NA>
1	0000-0001-5000-6548	True	False	False	Wiseman	Bekelesi	<NA>	<NA>	<NA>	<NA>	<NA>	<NA>	NaN	<NA>	<NA>	<NA>
2	0000-0001-5000-7962	True	True	True	ALICE	INDIMULI	<NA>	<NA>	<NA>	<NA>	<NA>	<NA>	NaN	<NA>	<NA>	<NA>
3	0000-0001-5000-8586	True	False	False	shim	ji yun	<NA>	<NA>	<NA>	<NA>	<NA>	<NA>	NaN	<NA>	<NA>	<NA>
4	0000-0001-5001-0256	True	False	False	Sandro	Caramaschi	<NA>	<NA>	<NA>	<NA>	<NA>	<NA>	NaN	<NA>	<NA>	<NA>

In [7]:

df[df['orcid'] == AM]

Out[7]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	keywords	external identifiers	education	employments	number of works	works source
8840413	0000-0002-5193-7851	True	True	True	Andrea	Mannocci	<NA>	<NA>	[["Personal website", "https://andremann.githu...	andrea.mannocci@isti.cnr.it	<NA>	["Data science ", "science of science", "schol...	"Scopus Author ID", "55233589900"	[["Information engineering", "Ph.D.", "Univers...	[["Research Associate", "Istituto di Scienza e...	37	["Scopus - Elsevier", "Crossref Metadata Searc...

Extracting works source¶

In [8]:

df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [9]:

def extract_work_source(lst):
    extracted = []
    for s in lst:
        if 'Scopus - Elsevier' in s or 'Crossref' in s:
            extracted.append(s)
    return extracted

In [10]:

df['extracted_works_source'] = df['works source'].apply(lambda x: extract_work_source(x))

In [11]:

df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']), columns=mlb.classes_)], axis = 1)

In [12]:

df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)

In [13]:

df[df['orcid'] == AM]

Out[13]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	keywords	external identifiers	education	employments	number of works	Crossref	Crossref Metadata Search	Scopus - Elsevier
8840413	0000-0002-5193-7851	True	True	True	Andrea	Mannocci	<NA>	<NA>	[["Personal website", "https://andremann.githu...	andrea.mannocci@isti.cnr.it	<NA>	["Data science ", "science of science", "schol...	"Scopus Author ID", "55233589900"	[["Information engineering", "Ph.D.", "Univers...	[["Research Associate", "Istituto di Scienza e...	37	1	1	1

Education¶

In [14]:

df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [15]:

df['n_education'] = df['education'].str.len()

In [16]:

df.drop('education', axis=1, inplace=True)

In [17]:

df[df['orcid'] == AM]

Out[17]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	keywords	external identifiers	employments	number of works	Crossref	Crossref Metadata Search	Scopus - Elsevier	n_education
8840413	0000-0002-5193-7851	True	True	True	Andrea	Mannocci	<NA>	<NA>	[["Personal website", "https://andremann.githu...	andrea.mannocci@isti.cnr.it	<NA>	["Data science ", "science of science", "schol...	"Scopus Author ID", "55233589900"	[["Research Associate", "Istituto di Scienza e...	37	1	1	1	4

Employment¶

In [18]:

df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [19]:

df['n_employments'] = df['employments'].str.len()

In [20]:

df.drop('employments', axis=1, inplace=True)

In [21]:

df[df['orcid'] == AM]

Out[21]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	keywords	external identifiers	number of works	Crossref	Crossref Metadata Search	Scopus - Elsevier	n_education	n_employments
8840413	0000-0002-5193-7851	True	True	True	Andrea	Mannocci	<NA>	<NA>	[["Personal website", "https://andremann.githu...	andrea.mannocci@isti.cnr.it	<NA>	["Data science ", "science of science", "schol...	"Scopus Author ID", "55233589900"	37	1	1	1	4	5

External IDs¶

In [22]:

df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [23]:

# def extract_ids(lst):
#     extracted = []
#     for id in lst:
#         extracted.append(id[0])
#     return extracted

In [24]:

# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))

In [25]:

# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)

In [26]:

df['n_ext_ids'] = df['external identifiers'].str.len()

In [27]:

df.drop(['external identifiers'], axis=1, inplace=True)

In [28]:

df[df['orcid'] == AM]

Out[28]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	keywords	number of works	Crossref	Crossref Metadata Search	Scopus - Elsevier	n_education	n_employments	n_ext_ids
8840413	0000-0002-5193-7851	True	True	True	Andrea	Mannocci	<NA>	<NA>	[["Personal website", "https://andremann.githu...	andrea.mannocci@isti.cnr.it	<NA>	["Data science ", "science of science", "schol...	37	1	1	1	4	5	1

Extracting email domains¶

In [29]:

df['primary email'] = df['primary email'].fillna('')
df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [30]:

def extract_email_domains(row):
    domains = []
    if len(row['primary email']) > 0:
        domains.append(row['primary email'].split('@')[1])
    for email in row['other emails']:
        domains.append(email.split('@')[1])
    return domains

In [31]:

df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)

In [32]:

df[df['email_domains'].str.len() != 0]['email_domains']

Out[32]:

34          [seh.ox.ac.uk, bsg.ox.ac.uk]
47                         [foxmail.com]
103                     [fvtm.bu.edu.eg]
297                           [unipa.it]
299                            [nhs.net]
                        ...             
10746811             [gva.es, gmail.com]
10746850                  [cinvestav.mx]
10746920        [gmail.com, hotmail.com]
10746975                       [mail.ru]
10746988                        [ucm.es]
Name: email_domains, Length: 141118, dtype: object

In [ ]:

df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)

In [ ]:

df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)

In [ ]:

df[df['orcid'] == AM]

Extracting URL domains¶

In [35]:

df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [36]:

def extract_url_domains(lst):
    domains = []
    for e in lst:
        # e[0] is a string describing the url
        # e[1] is the url
        ext = tldextract.extract(e[1])
        domains.append(ext.registered_domain)
    return domains

In [37]:

df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))

In [38]:

df[df['url_domains'].str.len() != 0]['url_domains']

Out[38]:

5                           [researchgate.net]
14                      [tigerscaffolds.co.nz]
15                         [corticalbrain.com]
29                                   [cnpq.br]
30                                [sksahu.net]
                           ...                
10746945                          [telegra.ph]
10746950    [twitter.com, urbanfoodpolicy.com]
10746955                    [openlearning.com]
10746984                        [panaximco.vn]
10746987                       [swansea.ac.uk]
Name: url_domains, Length: 688572, dtype: object

In [ ]:

df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)

In [39]:

df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)

In [40]:

df[df['orcid'] == AM]

Out[40]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	keywords	number of works	Crossref	Crossref Metadata Search	Scopus - Elsevier	n_education	n_employments	n_ext_ids	email_domains	url_domains
8840413	0000-0002-5193-7851	True	True	True	Andrea	Mannocci	<NA>	<NA>	["Data science ", "science of science", "schol...	37	1	1	1	4	5	1	[isti.cnr.it]	[github.io, twitter.com, linkedin.com]

Fixing keywords¶

In [18]:

df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))

Sometimes, different keywords are provided as a continuum (multiplexed in just one keyword). E.g.

In [19]:

df[df['orcid'] == PP]

Out[19]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	keywords	external identifiers	education	employments	number of works	works source	email_domains	url_domains
9601705	0000-0002-8588-4196	True	True	True	Pedro	Príncipe	Pedro Príncipe is an information, documentatio...	["Pedro Miguel de Oliveira Bento Pr\u00edncipe"]	[]		[]	[open access, open science, libraries, reposit...	"Ci\u00eancia ID", "C915-48B2-6C87"	<NA>	[["Librarian / Project manager", "Universidade...	5	["CI\u00caNCIAVITAE", "Pedro Pr\u00edncipe", "...	[]	[]

In [20]:

def fix_keywords(lst):
    fixed = []
    for k in lst:
        split = k.split(',')
        fixed.extend(split)
    return fixed

In [21]:

test = ['open access, open science, libraries, repositories, social web,']
fix_keywords(test)

Out[21]:

['open access',
 ' open science',
 ' libraries',
 ' repositories',
 ' social web',
 '']

In [22]:

df['fixed_keywords'] = df['keywords'].apply(lambda lst: fix_keywords(lst))

In [23]:

df[df['orcid'] == WHATSAPP]

Out[23]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	keywords	external identifiers	education	employments	number of works	works source	email_domains	url_domains	fixed_keywords
9517099	0000-0001-6997-9470	True	True	True	other	whatsapp	<NA>	<NA>	[[Otherwhatsapp, https://otherwhatsapp.com/], ...		[]	[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...	NaN	<NA>	<NA>	0	<NA>	[]	[otherwhatsapp.com, im-creator.com, facebook.c...	[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...

In [24]:

df.drop('keywords', axis=1, inplace=True)

Out[24]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	external identifiers	education	employments	number of works	works source	email_domains	url_domains	fixed_keywords
0	0000-0001-5000-2053	True	False	False	Jorge	Jaramillo Sanchez	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]
1	0000-0001-5000-6548	True	False	False	Wiseman	Bekelesi	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]
2	0000-0001-5000-7962	True	True	True	ALICE	INDIMULI	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]
3	0000-0001-5000-8586	True	False	False	shim	ji yun	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]
4	0000-0001-5001-0256	True	False	False	Sandro	Caramaschi	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
10747035	0000-0003-4998-1551	True	False	False	Animesh	Ghosh	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]
10747036	0000-0003-4998-4111	True	False	False	Hawa	Liberna	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]
10747037	0000-0003-4998-6045	True	False	False	Tongyi	Men	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]
10747038	0000-0003-4998-8868	True	True	False	Charldon	Wilken	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]
10747039	0000-0003-4999-7916	True	True	True	Tapas Bapu	B.R.	<NA>	<NA>	[]		[]	NaN	<NA>	<NA>	0	<NA>	[]	[]	[]

10747040 rows × 19 columns

Fixes for other columns with lists inside¶

In [27]:

# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))
# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))

Feature extraction¶

In [28]:

# df['email_encoding'] = mlb.fit_transform(df['email_domains'])
# df['url_encoding'] = mlb.fit_transform(df['url_domains'])

In [29]:

df

Out[29]:

	orcid	claimed	verifyed email	verified primary email	given names	family name	biography	other names	researcher urls	primary email	other emails	keywords	external identifiers	education	employments	number of works	works source	email_domains	url_domains	fixed_keywords
0	0000-0001-5000-2053	True	False	False	Jorge	Jaramillo Sanchez	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]
1	0000-0001-5000-6548	True	False	False	Wiseman	Bekelesi	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]
2	0000-0001-5000-7962	True	True	True	ALICE	INDIMULI	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]
3	0000-0001-5000-8586	True	False	False	shim	ji yun	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]
4	0000-0001-5001-0256	True	False	False	Sandro	Caramaschi	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
10747035	0000-0003-4998-1551	True	False	False	Animesh	Ghosh	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]
10747036	0000-0003-4998-4111	True	False	False	Hawa	Liberna	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]
10747037	0000-0003-4998-6045	True	False	False	Tongyi	Men	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]
10747038	0000-0003-4998-8868	True	True	False	Charldon	Wilken	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]
10747039	0000-0003-4999-7916	True	True	True	Tapas Bapu	B.R.	<NA>	<NA>	[]		[]	[]	[]	[]	[]	0	[]	[]	[]	[]

10747040 rows × 20 columns

82 KiB Raw Blame History Unescape Escape

Extracting works source¶

Education¶

Employment¶

External IDs¶

Extracting email domains¶

Extracting URL domains¶

Fixing keywords¶

Fixes for other columns with lists inside¶

Feature extraction¶

82 KiB

Raw Blame History Unescape Escape