You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

82 KiB

Todo in data

  • Column names -> no space
  • If a list is empty, serialise [] in the csv
  • If a string is empty, serialise '' in the csv
In [1]:
import ast
from urllib.parse import urlparse
import tldextract

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
In [2]:
mlb = MultiLabelBinarizer()
In [3]:
# Notable Solid ORCID iDs for debug purposes
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
In [4]:
# Notable fake ORCID iDs for debug purposes
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
In [5]:
df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header = 0,
                         dtype = {"orcid": pd.StringDtype(), 
                                  "claimed": bool, 
                                  "verifyed email": bool, 
                                  "verified primary email": bool,
                                  "given names": pd.StringDtype(),
                                  "family name": pd.StringDtype(),
                                  "biography": pd.StringDtype(),
                                  "other names": pd.StringDtype(),
                                  "researcher urls": pd.StringDtype(),
                                  "primary email": pd.StringDtype(),
                                  "other emails": pd.StringDtype(),
                                  "keywords": pd.StringDtype(),
                                  "eternal identifiers": pd.StringDtype(),
                                  "education": pd.StringDtype(),
                                  "employments": pd.StringDtype(),
                                  "number of works": pd.Int16Dtype(),
                                  "works source": pd.StringDtype()})
In [6]:
df.head(5)
Out[6]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers education employments number of works works source
0 0000-0001-5000-2053 True False False Jorge Jaramillo Sanchez <NA> <NA> <NA> <NA> <NA> <NA> NaN <NA> <NA> 0 <NA>
1 0000-0001-5000-6548 True False False Wiseman Bekelesi <NA> <NA> <NA> <NA> <NA> <NA> NaN <NA> <NA> 0 <NA>
2 0000-0001-5000-7962 True True True ALICE INDIMULI <NA> <NA> <NA> <NA> <NA> <NA> NaN <NA> <NA> 0 <NA>
3 0000-0001-5000-8586 True False False shim ji yun <NA> <NA> <NA> <NA> <NA> <NA> NaN <NA> <NA> 0 <NA>
4 0000-0001-5001-0256 True False False Sandro Caramaschi <NA> <NA> <NA> <NA> <NA> <NA> NaN <NA> <NA> 0 <NA>
In [7]:
df[df['orcid'] == AM]
Out[7]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers education employments number of works works source
8840413 0000-0002-5193-7851 True True True Andrea Mannocci <NA> <NA> [["Personal website", "https://andremann.githu... andrea.mannocci@isti.cnr.it <NA> ["Data science ", "science of science", "schol... "Scopus Author ID", "55233589900" [["Information engineering", "Ph.D.", "Univers... [["Research Associate", "Istituto di Scienza e... 37 ["Scopus - Elsevier", "Crossref Metadata Searc...

Extracting works source

In [8]:
df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [9]:
def extract_work_source(lst):
    extracted = []
    for s in lst:
        if 'Scopus - Elsevier' in s or 'Crossref' in s:
            extracted.append(s)
    return extracted
In [10]:
df['extracted_works_source'] = df['works source'].apply(lambda x: extract_work_source(x))
In [11]:
df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']), columns=mlb.classes_)], axis = 1)
In [12]:
df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)
In [13]:
df[df['orcid'] == AM]
Out[13]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers education employments number of works Crossref Crossref Metadata Search Scopus - Elsevier
8840413 0000-0002-5193-7851 True True True Andrea Mannocci <NA> <NA> [["Personal website", "https://andremann.githu... andrea.mannocci@isti.cnr.it <NA> ["Data science ", "science of science", "schol... "Scopus Author ID", "55233589900" [["Information engineering", "Ph.D.", "Univers... [["Research Associate", "Istituto di Scienza e... 37 1 1 1

Education

In [14]:
df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [15]:
df['n_education'] = df['education'].str.len()
In [16]:
df.drop('education', axis=1, inplace=True)
In [17]:
df[df['orcid'] == AM]
Out[17]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers employments number of works Crossref Crossref Metadata Search Scopus - Elsevier n_education
8840413 0000-0002-5193-7851 True True True Andrea Mannocci <NA> <NA> [["Personal website", "https://andremann.githu... andrea.mannocci@isti.cnr.it <NA> ["Data science ", "science of science", "schol... "Scopus Author ID", "55233589900" [["Research Associate", "Istituto di Scienza e... 37 1 1 1 4

Employment

In [18]:
df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [19]:
df['n_employments'] = df['employments'].str.len()
In [20]:
df.drop('employments', axis=1, inplace=True)
In [21]:
df[df['orcid'] == AM]
Out[21]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers number of works Crossref Crossref Metadata Search Scopus - Elsevier n_education n_employments
8840413 0000-0002-5193-7851 True True True Andrea Mannocci <NA> <NA> [["Personal website", "https://andremann.githu... andrea.mannocci@isti.cnr.it <NA> ["Data science ", "science of science", "schol... "Scopus Author ID", "55233589900" 37 1 1 1 4 5

External IDs

In [22]:
df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [23]:
# def extract_ids(lst):
#     extracted = []
#     for id in lst:
#         extracted.append(id[0])
#     return extracted
In [24]:
# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))
In [25]:
# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)
In [26]:
df['n_ext_ids'] = df['external identifiers'].str.len()
In [27]:
df.drop(['external identifiers'], axis=1, inplace=True)
In [28]:
df[df['orcid'] == AM]
Out[28]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords number of works Crossref Crossref Metadata Search Scopus - Elsevier n_education n_employments n_ext_ids
8840413 0000-0002-5193-7851 True True True Andrea Mannocci <NA> <NA> [["Personal website", "https://andremann.githu... andrea.mannocci@isti.cnr.it <NA> ["Data science ", "science of science", "schol... 37 1 1 1 4 5 1

Extracting email domains

In [29]:
df['primary email'] = df['primary email'].fillna('')
df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [30]:
def extract_email_domains(row):
    domains = []
    if len(row['primary email']) > 0:
        domains.append(row['primary email'].split('@')[1])
    for email in row['other emails']:
        domains.append(email.split('@')[1])
    return domains
In [31]:
df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)
In [32]:
df[df['email_domains'].str.len() != 0]['email_domains']
Out[32]:
34          [seh.ox.ac.uk, bsg.ox.ac.uk]
47                         [foxmail.com]
103                     [fvtm.bu.edu.eg]
297                           [unipa.it]
299                            [nhs.net]
                        ...             
10746811             [gva.es, gmail.com]
10746850                  [cinvestav.mx]
10746920        [gmail.com, hotmail.com]
10746975                       [mail.ru]
10746988                        [ucm.es]
Name: email_domains, Length: 141118, dtype: object
In [ ]:
df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)
In [ ]:
df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)
In [ ]:
df[df['orcid'] == AM]

Extracting URL domains

In [35]:
df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [36]:
def extract_url_domains(lst):
    domains = []
    for e in lst:
        # e[0] is a string describing the url
        # e[1] is the url
        ext = tldextract.extract(e[1])
        domains.append(ext.registered_domain)
    return domains
In [37]:
df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))
In [38]:
df[df['url_domains'].str.len() != 0]['url_domains']
Out[38]:
5                           [researchgate.net]
14                      [tigerscaffolds.co.nz]
15                         [corticalbrain.com]
29                                   [cnpq.br]
30                                [sksahu.net]
                           ...                
10746945                          [telegra.ph]
10746950    [twitter.com, urbanfoodpolicy.com]
10746955                    [openlearning.com]
10746984                        [panaximco.vn]
10746987                       [swansea.ac.uk]
Name: url_domains, Length: 688572, dtype: object
In [ ]:
df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)
In [39]:
df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)
In [40]:
df[df['orcid'] == AM]
Out[40]:
orcid claimed verifyed email verified primary email given names family name biography other names keywords number of works Crossref Crossref Metadata Search Scopus - Elsevier n_education n_employments n_ext_ids email_domains url_domains
8840413 0000-0002-5193-7851 True True True Andrea Mannocci <NA> <NA> ["Data science ", "science of science", "schol... 37 1 1 1 4 5 1 [isti.cnr.it] [github.io, twitter.com, linkedin.com]

Fixing keywords

In [18]:
df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))

Sometimes, different keywords are provided as a continuum (multiplexed in just one keyword). E.g.

In [19]:
df[df['orcid'] == PP]
Out[19]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers education employments number of works works source email_domains url_domains
9601705 0000-0002-8588-4196 True True True Pedro Príncipe Pedro Príncipe is an information, documentatio... ["Pedro Miguel de Oliveira Bento Pr\u00edncipe"] [] [] [open access, open science, libraries, reposit... "Ci\u00eancia ID", "C915-48B2-6C87" <NA> [["Librarian / Project manager", "Universidade... 5 ["CI\u00caNCIAVITAE", "Pedro Pr\u00edncipe", "... [] []
In [20]:
def fix_keywords(lst):
    fixed = []
    for k in lst:
        split = k.split(',')
        fixed.extend(split)
    return fixed
In [21]:
test = ['open access, open science, libraries, repositories, social web,']
fix_keywords(test)
Out[21]:
['open access',
 ' open science',
 ' libraries',
 ' repositories',
 ' social web',
 '']
In [22]:
df['fixed_keywords'] = df['keywords'].apply(lambda lst: fix_keywords(lst))
In [23]:
df[df['orcid'] == WHATSAPP]
Out[23]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers education employments number of works works source email_domains url_domains fixed_keywords
9517099 0000-0001-6997-9470 True True True other whatsapp <NA> <NA> [[Otherwhatsapp, https://otherwhatsapp.com/], ... [] [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... NaN <NA> <NA> 0 <NA> [] [otherwhatsapp.com, im-creator.com, facebook.c... [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...
In [24]:
df.drop('keywords', axis=1, inplace=True)
Out[24]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails external identifiers education employments number of works works source email_domains url_domains fixed_keywords
0 0000-0001-5000-2053 True False False Jorge Jaramillo Sanchez <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []
1 0000-0001-5000-6548 True False False Wiseman Bekelesi <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []
2 0000-0001-5000-7962 True True True ALICE INDIMULI <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []
3 0000-0001-5000-8586 True False False shim ji yun <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []
4 0000-0001-5001-0256 True False False Sandro Caramaschi <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10747035 0000-0003-4998-1551 True False False Animesh Ghosh <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []
10747036 0000-0003-4998-4111 True False False Hawa Liberna <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []
10747037 0000-0003-4998-6045 True False False Tongyi Men <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []
10747038 0000-0003-4998-8868 True True False Charldon Wilken <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []
10747039 0000-0003-4999-7916 True True True Tapas Bapu B.R. <NA> <NA> [] [] NaN <NA> <NA> 0 <NA> [] [] []

10747040 rows × 19 columns

Fixes for other columns with lists inside

In [27]:
# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))
# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))
# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))

Feature extraction

In [28]:
# df['email_encoding'] = mlb.fit_transform(df['email_domains'])
# df['url_encoding'] = mlb.fit_transform(df['url_domains'])
In [29]:
df
Out[29]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers education employments number of works works source email_domains url_domains fixed_keywords
0 0000-0001-5000-2053 True False False Jorge Jaramillo Sanchez <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []
1 0000-0001-5000-6548 True False False Wiseman Bekelesi <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []
2 0000-0001-5000-7962 True True True ALICE INDIMULI <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []
3 0000-0001-5000-8586 True False False shim ji yun <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []
4 0000-0001-5001-0256 True False False Sandro Caramaschi <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10747035 0000-0003-4998-1551 True False False Animesh Ghosh <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []
10747036 0000-0003-4998-4111 True False False Hawa Liberna <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []
10747037 0000-0003-4998-6045 True False False Tongyi Men <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []
10747038 0000-0003-4998-8868 True True False Charldon Wilken <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []
10747039 0000-0003-4999-7916 True True True Tapas Bapu B.R. <NA> <NA> [] [] [] [] [] [] 0 [] [] [] []

10747040 rows × 20 columns