51 KiB
51 KiB
Dataset preprocessing¶
In [49]:
import pandas as pd
import ast
import tldextract
import numpy
In [50]:
# Notable Solid ORCID iDs for debug purposes
# Used below to spot-check the processed frame on known-legitimate profiles.
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
In [51]:
# Notable fake ORCID iDs for debug purposes
# Used below to spot-check the processed frame on known-spam profiles.
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
In [52]:
# Explicit dtypes for the raw ORCID profile dump; nullable pandas dtypes are
# used so missing values survive the load.
RAW_DTYPES = {
    'orcid': pd.StringDtype(),
    'claimed': bool,
    'verifyed email': bool,
    'verified primary email': bool,
    'given names': pd.StringDtype(),
    'family name': pd.StringDtype(),
    'biography': pd.StringDtype(),
    'other names': pd.StringDtype(),
    'researcher urls': pd.StringDtype(),
    'primary email': pd.StringDtype(),
    'other emails': pd.StringDtype(),
    'keywords': pd.StringDtype(),
    'external identifiers': pd.StringDtype(),
    'education': pd.StringDtype(),
    'employments': pd.StringDtype(),
    'number of works': pd.Int16Dtype(),
    'works source': pd.StringDtype(),
}

# Load the raw, tab-separated profile dump.
df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header=0,
                 dtype=RAW_DTYPES)
In [53]:
# Free-text columns: represent missing values as the empty string so the
# later concatenation step never hits <NA>.
for _col in ('given names', 'family name', 'biography', 'primary email'):
    df[_col] = df[_col].fillna('')
In [54]:
# 'other names' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['other names'] = df['other names'].fillna('[]').map(ast.literal_eval)
In [55]:
# 'keywords' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['keywords'] = df['keywords'].fillna('[]').map(ast.literal_eval)
def extract_url_domains(lst):
    """Map parsed researcher-url entries to their registered domains.

    Each entry is a (description, url) pair:
      e[0] is a string describing the url
      e[1] is the url itself
    Only the url is used; tldextract reduces it to its registered domain.
    """
    domains = []
    for e in lst:
        domains.append(tldextract.extract(e[1]).registered_domain)
    return domains

# Parse the serialized list of (description, url) pairs first...
df['researcher urls'] = df['researcher urls'].fillna('[]').apply(ast.literal_eval)
# ...then reduce each entry to its registered domain.
# NOTE: in the original notebook these cells were executed out of order
# (In[59] before In[57]/In[58]); on a fresh Restart-&-Run-All the extraction
# ran before the function existed and before the column was parsed.
df['url_domains'] = df['researcher urls'].apply(extract_url_domains)
In [60]:
# 'education' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['education'] = df['education'].fillna('[]').map(ast.literal_eval)
In [61]:
def extract_education(lst):
    """Collapse each parsed education entry into a 'degree role university' string.

    Each entry is a sequence laid out as:
      e[0] degree, e[1] role, e[2] university,
      followed by city, region, country, id, id_scheme (ignored here).
    """
    return [' '.join(entry[:3]) for entry in lst]
In [62]:
# 'employments' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['employments'] = df['employments'].fillna('[]').map(ast.literal_eval)
In [63]:
def extract_employment(lst):
    """Collapse each parsed employment entry into a 'role institute' string.

    Each entry is a sequence laid out as:
      e[0] role, e[1] institute,
      followed by city, region, country, id, id_scheme (ignored here).
    """
    return [' '.join(entry[:2]) for entry in lst]
In [56]:
# 'other emails' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['other emails'] = df['other emails'].fillna('[]').map(ast.literal_eval)
In [64]:
def extract_email_domains(lst):
    """Split each address at '@' by replacing it with a space.

    E.g. 'alice@example.org' -> 'alice example.org', so the local part and
    the domain become separate tokens for downstream text processing.
    """
    return [address.replace('@', ' ') for address in lst]
In [81]:
# Concatenate all textual profile fields into one newline-separated document
# per record; this is the text later fed to the vectorizer / language model.
# BUG FIX: the original used df['primary email'].values[0], which pasted the
# FIRST row's primary email into every record. Use a per-row replace instead.
df['concat'] = df['given names'] + ' ' + df['family name'] + '\n' + \
    df['other names'].apply(' '.join) + '\n' + \
    df['primary email'].str.replace('@', ' ', regex=False) + '\n' + \
    df['other emails'].apply(lambda x: ' '.join(extract_email_domains(x))) + '\n' + \
    df['biography'] + '\n' + \
    df['keywords'].apply(' - '.join) + '\n' + \
    df['url_domains'].apply(' '.join) + '\n' + \
    df['education'].apply(lambda x: '\n'.join(extract_education(x))) + '\n' + \
    df['employments'].apply(lambda x: '\n'.join(extract_employment(x)))
In [82]:
# Spot-check the concatenated document for a known-legitimate profile.
print(df.loc[df['orcid'] == AM, 'concat'].values[0])
In [19]:
# Label profiles with at least one work as positive examples.
# FIX: the original only set True for matching rows and left every other row's
# label as NaN; make the label an explicit boolean for all rows so later
# filters and splits see False (not missing) for zero-work profiles.
df['label'] = (df['number of works'] > 0).fillna(False)
In [20]:
# Sanity check: inspect a known-legitimate profile.
df[df['orcid'] == AM]
Out[20]:
In [21]:
# Sanity check: inspect a known-fake profile.
df[df['orcid'] == SCAFFOLD]
Out[21]:
In [25]:
# Keep only the columns needed downstream: the id, the concatenated text,
# and the label.
df = df[['orcid', 'concat', 'label']]
df
Out[25]:
Pre-trained spam filter as-is¶
In [83]:
import string
import torch
import transformers
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
One-Class SVM¶
In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
In [44]:
# Train a One-Class SVM using only the verified (label == True) profiles as
# inliers.
samples = df[df['label'] == True].copy()  # .copy(): independent frame, avoids SettingWithCopyWarning

vectorizer = TfidfVectorizer()
# FIX: keep the TF-IDF result as a standalone sparse matrix — a scipy.sparse
# matrix cannot be stored as a single DataFrame column, which is what the
# original `samples['features'] = ...` attempted.
features = vectorizer.fit_transform(samples['concat'])

# Hold out 70% of the inliers for evaluation.
# FIX: the original passed `stratify=y` with `y` undefined (NameError); all
# labels here are True anyway, so stratification is meaningless and dropped.
trainX, testX, trainy, testy = train_test_split(
    features, samples['label'], test_size=0.7, random_state=3)

# nu ~ expected fraction of outliers tolerated among the training inliers.
model = OneClassSVM(gamma='scale', nu=0.01)
model.fit(trainX)
Training¶
In [118]:
# Re-fit the one-class model on ALL verified profiles (no hold-out).
# FIX: the original fit on the raw DataFrame (string columns), which raises a
# ValueError — the text must go through the fitted TF-IDF vectorizer first.
model.fit(vectorizer.transform(df.loc[df['label'] == True, 'concat']))
Evaluation¶
BERT¶
In [30]:
# Language model Databunch
from fast_bert.data_lm import BertLMDataBunch
# Language model learner
from fast_bert.learner_lm import BertLMLearner
from pathlib import Path
from box import Box
import logging
logger = logging.getLogger()  # root logger, passed to the fast_bert databunch below
In [34]:
# Box is a nice wrapper to create an object from a json dict
args = Box({
"seed": 42,
"task_name": 'imdb_reviews_lm',
"model_name": 'roberta-base',
"model_type": 'roberta',
"train_batch_size": 16,
"learning_rate": 4e-5,
"num_train_epochs": 20,
"fp16": True,
"fp16_opt_level": "O2",
"warmup_steps": 1000,
"logging_steps": 0,
"max_seq_length": 512,
"multi_gpu": False
})
DATA_PATH = Path('../data/processed')
LOG_PATH = Path('../logs')
MODEL_PATH = Path('../models/lm_model_{}/'.format(args.model_type))
DATA_PATH.mkdir(exist_ok=True)
MODEL_PATH.mkdir(exist_ok=True)
LOG_PATH.mkdir(exist_ok=True)
In [35]:
# Build the language-model databunch directly from the concatenated profile
# texts, so the LM can be fine-tuned on this corpus before classification.
databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=df['concat'],
    tokenizer=args.model_name,       # tokenizer resolved from the model name
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    model_type=args.model_type,
    logger=logger)