Dataset preprocessing

In [49]:
import pandas as pd
import ast
import tldextract
import numpy
In [50]:
# Notable Solid ORCID iDs for debug purposes
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
In [51]:
# Notable fake ORCID iDs for debug purposes
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
In [52]:
df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header=0,
                         dtype = {'orcid': pd.StringDtype(), 
                                  'claimed': bool, 
                                  'verifyed email': bool, 
                                  'verified primary email': bool,
                                  'given names': pd.StringDtype(),
                                  'family name': pd.StringDtype(),
                                  'biography': pd.StringDtype(),
                                  'other names': pd.StringDtype(),
                                  'researcher urls': pd.StringDtype(),
                                  'primary email': pd.StringDtype(),
                                  'other emails': pd.StringDtype(),
                                  'keywords': pd.StringDtype(),
                                  'external identifiers': pd.StringDtype(),
                                  'education': pd.StringDtype(),
                                  'employments': pd.StringDtype(),
                                  'number of works': pd.Int16Dtype(),
                                  'works source': pd.StringDtype()})
In [53]:
df['given names'] = df['given names'].fillna('')
df['family name'] = df['family name'].fillna('')
df['biography'] = df['biography'].fillna('')
df['primary email'] = df['primary email'].fillna('')
In [54]:
df['other names'] = df['other names'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [55]:
df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [57]:
df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [58]:
def extract_url_domains(lst):
    domains = []
    for e in lst:
        # e[0] is a string describing the url
        # e[1] is the url
        domain = tldextract.extract(e[1])
        domains.append(domain.registered_domain)
    return domains
In [59]:
df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))
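A quick sanity check on tldextract (the URL here is a made-up example): registered_domain keeps only the registrable part of a URL, which is why personal pages collapse to domains such as github.io in url_domains.

In [ ]:
# Hypothetical URL for illustration: the subdomain and path are dropped,
# leaving only the registrable domain.
tldextract.extract('https://someone.github.io/cv').registered_domain  # 'github.io'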
In [60]:
df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [61]:
def extract_education(lst):
    educations = []
    for e in lst:
        # e[0] degree
        # e[1] role
        # e[2] university
        # e[..] city, region, country, id, id_scheme
        educations.append(' '.join([e[0], e[1], e[2]]))
    return educations
In [62]:
df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [63]:
def extract_employment(lst):
    res = []
    for e in lst:
        # e[0] role
        # e[1] institute
        # e[..] city, region, country, id, id_scheme
        res.append(' '.join([e[0], e[1]]))
    return res
In [56]:
df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))
In [64]:
def extract_email_domains(lst):
    res = []
    for email in lst:
        # replace '@' with a space so that the local part and the domain
        # become separate tokens
        res.append(email.replace('@', ' '))
    return res
In [81]:
# One text blob per profile; the primary email is transformed per row
# (not just the first row's value) so each profile keeps its own email tokens
df['concat'] =  df['given names'] + ' ' + df['family name'] + '\n' + \
                df['other names'].apply(lambda x: ' '.join(x)) + '\n' + \
                df['primary email'].str.replace('@', ' ', regex=False) + '\n' + \
                df['other emails'].apply(lambda x: ' '.join(extract_email_domains(x))) + '\n' + \
                df['biography'] + '\n' + \
                df['keywords'].apply(lambda x: ' - '.join(x)) + '\n' + \
                df['url_domains'].apply(lambda x: ' '.join(x)) + '\n' + \
                df['education'].apply(lambda x: '\n'.join(extract_education(x))) + '\n' + \
                df['employments'].apply(lambda x: '\n'.join(extract_employment(x)))
In [82]:
print(df[df['orcid'] == AM]['concat'].values[0])
Andrea Mannocci

andrea.mannocci isti.cnr.it


Data science  - science of science - scholarly knowledge mining - open science - research infrastructures
github.io twitter.com linkedin.com
Information engineering Ph.D. Università degli Studi di Pisa
Telematics engineering M.Sc. Universidad Carlos III de Madrid
Computer engineering M.Sc. Università degli Studi di Pisa
Computer engineering B.Sc. Università degli Studi di Pisa
Research Associate Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche
Research Associate The Open University
Research assistant Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche
Research assistant IMDEA Networks
Research assistant Syddansk Universitet
In [19]:
# Profiles with at least one declared work are taken as solid (True);
# all other rows stay unlabelled (NaN)
df.loc[df['number of works'] > 0, 'label'] = True
In [20]:
df[df['orcid'] == AM]
Out[20]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers education employments number of works works source url_domains concat label
8840413 0000-0002-5193-7851 True True True Andrea Mannocci [] [[Personal website, https://andremann.github.i... andrea.mannocci@isti.cnr.it [] [Data science , science of science, scholarly ... "Scopus Author ID", "55233589900" [[Information engineering, Ph.D., Università d... [[Research Associate, Istituto di Scienza e Te... 37 ["Scopus - Elsevier", "Crossref Metadata Searc... [github.io, twitter.com, linkedin.com] 0000-0002-5193-7851 Andrea Mannocci andrea.ma... True
In [21]:
df[df['orcid'] == SCAFFOLD]
Out[21]:
orcid claimed verifyed email verified primary email given names family name biography other names researcher urls primary email other emails keywords external identifiers education employments number of works works source url_domains concat label
14 0000-0001-5004-7761 True True True scaffolding hire [The first feature that you have to check in t... [[scaffolding hire Wellington, https://www.tig... [] [scaffolding hire Wellington] <NA> [] [] 0 <NA> [tigerscaffolds.co.nz] 0000-0001-5004-7761 scaffolding hire The first... NaN
In [25]:
df = df[['orcid', 'concat', 'label']]
df
Out[25]:
orcid concat label
0 0000-0001-5000-2053 Jorge Jaramillo Sanchez ... NaN
1 0000-0001-5000-6548 Wiseman Bekelesi NaN
2 0000-0001-5000-7962 ALICE INDIMULI NaN
3 0000-0001-5000-8586 shim ji yun NaN
4 0000-0001-5001-0256 Sandro Caramaschi NaN
... ... ...
10747035 0000-0003-4998-1551 Animesh Ghosh NaN
10747036 0000-0003-4998-4111 Hawa Liberna NaN
10747037 0000-0003-4998-6045 Tongyi Men NaN
10747038 0000-0003-4998-8868 Charldon Wilken NaN
10747039 0000-0003-4999-7916 Tapas Bapu B.R. NaN

10747040 rows × 3 columns
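At this point only the profiles with at least one work carry label == True; every other row is NaN, so the dataset holds positive examples only. A quick count makes the class setup explicit:

In [ ]:
# True = profiles with works; NaN = unlabelled remainder
df['label'].value_counts(dropna=False)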

Pre-trained spam filter as-is

In [83]:
import string
import torch
import transformers
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-83-72f6535bcb9f> in <module>
      4 import numpy as np # linear algebra
      5 import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
----> 6 import seaborn as sns
      7 import matplotlib.pyplot as plt
      8 from nltk.corpus import stopwords

ModuleNotFoundError: No module named 'seaborn'
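The pre-trained filter experiment stops here: seaborn is simply missing from this virtualenv. Installing it from within the notebook and restarting the kernel should let the imports go through; the exact package set needed (nltk may be absent too) is an assumption:

In [ ]:
# Assumed fix for the import error above; restart the kernel afterwards.
!pip install seaborn nltk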

One-Class SVM

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
In [44]:
# Keep the "solid" profiles (at least one work) as positive samples;
# .copy() avoids the SettingWithCopyWarning raised when writing to a slice
samples = df[df['label'] == True].copy()

# TF-IDF features live in a scipy sparse matrix; keep them in a separate
# variable rather than assigning the matrix to a DataFrame column, which
# would store the same object in every row
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(samples['concat'])
In [46]:
X  # sparse matrix with 2,674,451 rows, one TF-IDF vector per profile
In [ ]:
# All labels here are True, so there is nothing to stratify on; the split
# only sets aside held-out data for scoring the one-class model
trainX, testX, trainy, testy = train_test_split(X, samples['label'], test_size=0.7, random_state=3)
In [ ]:
model = OneClassSVM(gamma='scale', nu=0.01)
model.fit(trainX)
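As a sanity check (a sketch, not part of the original run): OneClassSVM.predict returns +1 for inliers and -1 for outliers, so on the held-out positives the inlier fraction should land near 1 - nu if the model captured the class well. Note that fitting and scoring at this scale (millions of rows) may be very slow.

In [ ]:
# Score the held-out positives: predict() yields +1 (inlier) / -1 (outlier)
pred = model.predict(testX)
(pred == 1).mean()  # should be roughly 1 - nu on positive data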

Training

In [118]:
model.fit(df[df['label'] == True])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-118-78458a59bca9> in <module>
----> 1 model.fit(df[df['label'] == True])

~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/svm/_classes.py in fit(self, X, y, sample_weight, **params)
   1374 
   1375         """
-> 1376         super().fit(X, np.ones(_num_samples(X)),
   1377                     sample_weight=sample_weight, **params)
   1378         self.offset_ = -self._intercept_

[... intermediate validation frames in sklearn and pandas omitted ...]

~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/string_.py in astype(self, dtype, copy)
    306             mask = self.isna()
    307             arr[mask] = 0
--> 308             values = arr.astype(dtype)
    309             values[mask] = np.nan
    310             return values
ValueError: could not convert string to float: '0000-0001-5001-4994'
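The fit fails because the slice passed to the model still contains string columns (orcid and concat), which cannot be cast to float; an SVM needs a numeric matrix. The model has to be fit on the TF-IDF matrix built earlier. A minimal sketch, reusing the trainX split from above:

In [ ]:
# Fit on the TF-IDF features, not on the raw DataFrame of strings
model = OneClassSVM(gamma='scale', nu=0.01)
model.fit(trainX)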

Evaluation

BERT

In [30]:
# Language model Databunch
from fast_bert.data_lm import BertLMDataBunch
# Language model learner
from fast_bert.learner_lm import BertLMLearner

from pathlib import Path
from box import Box

import logging
logger = logging.getLogger()
In [34]:
# Box is a nice wrapper to create an object from a json dict
args = Box({
    "seed": 42,
    "task_name": 'imdb_reviews_lm',
    "model_name": 'roberta-base',
    "model_type": 'roberta',
    "train_batch_size": 16,
    "learning_rate": 4e-5,
    "num_train_epochs": 20,
    "fp16": True,
    "fp16_opt_level": "O2",
    "warmup_steps": 1000,
    "logging_steps": 0,
    "max_seq_length": 512,
    "multi_gpu": False
})

DATA_PATH = Path('../data/processed')
LOG_PATH = Path('../logs')
MODEL_PATH = Path('../models/lm_model_{}/'.format(args.model_type))

DATA_PATH.mkdir(exist_ok=True)
MODEL_PATH.mkdir(exist_ok=True)
LOG_PATH.mkdir(exist_ok=True)
In [35]:
databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=df['concat'],
    tokenizer=args.model_name,
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    model_type=args.model_type,
    logger=logger)
1.27% [123222/9672336 00:03<04:04]
---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-35-702486b1b84c> in <module>
----> 1 databunch_lm = BertLMDataBunch.from_raw_corpus(
      2     data_dir=DATA_PATH,
      3     text_list=df['concat'],
      4     tokenizer=args.model_name,
      5     batch_size_per_gpu=args.train_batch_size,

~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py in from_raw_corpus(data_dir, text_list, tokenizer, batch_size_per_gpu, max_seq_length, multi_gpu, test_size, model_type, logger, clear_cache, no_cache)
    191         )
    192         # Create train corpus
--> 193         create_corpus(train_list, str(data_dir / train_file), logger=logger)
    194 
    195         # Create val corpus

~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py in create_corpus(text_list, target_path, logger)
     87             text = text.strip()
     88 
---> 89             f.write(text + "\n")
     90 
     91 

UnicodeEncodeError: 'utf-8' codec can't encode characters in position 162-163: surrogates not allowed
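The crash is caused by lone surrogate code points in some profile strings, which cannot be written out as UTF-8 when fast-bert builds its training corpus. A plausible workaround (an assumption, not verified here) is to force concat through a UTF-8 round trip that drops the offending characters before calling from_raw_corpus again:

In [ ]:
# Drop characters that cannot be encoded as UTF-8 (e.g. lone surrogates),
# then rebuild the databunch from the cleaned column
df['concat'] = df['concat'].apply(
    lambda s: s.encode('utf-8', errors='ignore').decode('utf-8'))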