# Dataset preprocessing

In [49]:
import pandas as pd
import ast
import tldextract
import numpy

In [50]:
# Notable Solid ORCID iDs for debug purposes
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'


In [51]:
# Notable fake ORCID iDs for debug purposes
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'


In [52]:
df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header=0,
                         dtype = {'orcid': pd.StringDtype(), 
                                  'claimed': bool, 
                                  'verifyed email': bool, 
                                  'verified primary email': bool,
                                  'given names': pd.StringDtype(),
                                  'family name': pd.StringDtype(),
                                  'biography': pd.StringDtype(),
                                  'other names': pd.StringDtype(),
                                  'researcher urls': pd.StringDtype(),
                                  'primary email': pd.StringDtype(),
                                  'other emails': pd.StringDtype(),
                                  'keywords': pd.StringDtype(),
                                  'external identifiers': pd.StringDtype(),
                                  'education': pd.StringDtype(),
                                  'employments': pd.StringDtype(),
                                  'number of works': pd.Int16Dtype(),
                                  'works source': pd.StringDtype()})

In [53]:
df['given names'] = df['given names'].fillna('')
df['family name'] = df['family name'].fillna('')
df['biography'] = df['biography'].fillna('')
df['primary email'] = df['primary email'].fillna('')

In [54]:
df['other names'] = df['other names'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [55]:
df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [59]:
df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))

In [57]:
df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [58]:
def extract_url_domains(lst):
    domains = []
    for e in lst:
        # e[0] is a string describing the url
        # e[1] is the url
        domain = tldextract.extract(e[1])
        domains.append(domain.registered_domain)
    return domains

In [60]:
df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [61]:
def extract_education(lst):
    educations = []
    for e in lst:
        # e[0] degree
        # e[1] role
        # e[2] university
        # e[..] city, region, country, id, id_scheme
        educations.append(' '.join([e[0], e[1], e[2]]))
    return educations

In [62]:
df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [63]:
def extract_employment(lst):
    res = []
    for e in lst:
        # e[0] role
        # e[1] institute
        # e[..] city, region, country, id, id_scheme
        res.append(' '.join([e[0], e[1]]))
    return res

In [56]:
df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))

In [64]:
def extract_email_domains(lst):
    res = []
    for email in lst:
        res.append(email.replace('@', ' '))
    return res

In [81]:
df['concat'] =  df['given names'] + ' ' + df['family name'] + '\n' + \
                df['other names'].apply(lambda x: ' '.join(x)) + '\n' + \
                df['primary email'].values[0].replace('@', ' ') + '\n' + \
                df['other emails'].apply(lambda x: ' '.join(extract_email_domains(x))) + '\n' + \
                df['biography'] + '\n' + \
                df['keywords'].apply(lambda x: ' - '.join(x)) + '\n' + \
                df['url_domains'].apply(lambda x: ' '.join(x)) + '\n' + \
                df['education'].apply(lambda x: '\n'.join(extract_education(x))) + '\n' + \
                df['employments'].apply(lambda x: '\n'.join(extract_employment(x)))

In [82]:
print(df[df['orcid'] == AM]['concat'].values[0])

Andrea Mannocci




Data science  - science of science - scholarly knowledge mining - open science - research infrastructures
github.io twitter.com linkedin.com
Information engineering Ph.D. Università degli Studi di Pisa
Telematics engineering M.Sc. Universidad Carlos III de Madrid
Computer engineering M.Sc. Università degli Studi di Pisa
Computer engineering B.Sc. Università degli Studi di Pisa
Research Associate Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche
Research Associate The Open University
Research assistant Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche
Research assistant IMDEA Networks
Research assistant Syddansk Universitet


In [19]:
df.loc[df['number of works'] > 0, 'label'] = True

In [20]:
df[df['orcid'] == AM]

Unnamed: 0,orcid,claimed,verifyed email,verified primary email,given names,family name,biography,other names,researcher urls,primary email,other emails,keywords,external identifiers,education,employments,number of works,works source,url_domains,concat,label
8840413,0000-0002-5193-7851,True,True,True,Andrea,Mannocci,,[],"[[Personal website, https://andremann.github.i...",andrea.mannocci@isti.cnr.it,[],"[Data science , science of science, scholarly ...","[[""Scopus Author ID"", ""55233589900""]]","[[Information engineering, Ph.D., Università d...","[[Research Associate, Istituto di Scienza e Te...",37,"[""Scopus - Elsevier"", ""Crossref Metadata Searc...","[github.io, twitter.com, linkedin.com]",0000-0002-5193-7851 Andrea Mannocci andrea.ma...,True


In [21]:
df[df['orcid'] == SCAFFOLD]

Unnamed: 0,orcid,claimed,verifyed email,verified primary email,given names,family name,biography,other names,researcher urls,primary email,other emails,keywords,external identifiers,education,employments,number of works,works source,url_domains,concat,label
14,0000-0001-5004-7761,True,True,True,scaffolding,hire,,[The first feature that you have to check in t...,"[[scaffolding hire Wellington, https://www.tig...",,[],[scaffolding hire Wellington],,[],[],0,,[tigerscaffolds.co.nz],0000-0001-5004-7761 scaffolding hire The first...,


In [25]:
df = df[['orcid', 'concat', 'label']]
df

Unnamed: 0,concat,label
0,0000-0001-5000-2053 Jorge Jaramillo Sanchez ...,
1,0000-0001-5000-6548 Wiseman Bekelesi,
2,0000-0001-5000-7962 ALICE INDIMULI,
3,0000-0001-5000-8586 shim ji yun,
4,0000-0001-5001-0256 Sandro Caramaschi,
...,...,...
10747035,0000-0003-4998-1551 Animesh Ghosh,
10747036,0000-0003-4998-4111 Hawa Liberna,
10747037,0000-0003-4998-6045 Tongyi Men,
10747038,0000-0003-4998-8868 Charldon Wilken,


# Pre-trained spam filter as-is

In [83]:
import string
import torch
import transformers
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

ModuleNotFoundError: No module named 'seaborn'

# One-Class SVM

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split


In [44]:
samples = df[df['label'] == True]
vectorizer = TfidfVectorizer()
samples['features'] = vectorizer.fit_transform(samples['concat'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples['features'] = vectorizer.fit_transform(samples['concat'])


In [46]:
samples['features']

5             (0, 983769)\t0.04916990678988556\n  (0, 1177...
13            (0, 983769)\t0.04916990678988556\n  (0, 1177...
24            (0, 983769)\t0.04916990678988556\n  (0, 1177...
26            (0, 983769)\t0.04916990678988556\n  (0, 1177...
29            (0, 983769)\t0.04916990678988556\n  (0, 1177...
                                  ...                        
10747024      (0, 983769)\t0.04916990678988556\n  (0, 1177...
10747026      (0, 983769)\t0.04916990678988556\n  (0, 1177...
10747027      (0, 983769)\t0.04916990678988556\n  (0, 1177...
10747030      (0, 983769)\t0.04916990678988556\n  (0, 1177...
10747034      (0, 983769)\t0.04916990678988556\n  (0, 1177...
Name: features, Length: 2674451, dtype: object

In [None]:
trainX, testX, trainy, testy = train_test_split(samples['features'], samples['label'], test_size=0.7, random_state=3, stratify=y)

In [None]:
model = OneClassSVM(gamma='scale', nu=0.01)
model.fit(trainX)

## Training

In [118]:
model.fit(df[df['label'] == True])

ValueError: could not convert string to float: '0000-0001-5001-4994'

## Evaluation

# BERT

In [30]:
# Language model Databunch
from fast_bert.data_lm import BertLMDataBunch
# Language model learner
from fast_bert.learner_lm import BertLMLearner

from pathlib import Path
from box import Box

import logging
logger = logging.getLogger()


In [34]:
# Box is a nice wrapper to create an object from a json dict
args = Box({
    "seed": 42,
    "task_name": 'imdb_reviews_lm',
    "model_name": 'roberta-base',
    "model_type": 'roberta',
    "train_batch_size": 16,
    "learning_rate": 4e-5,
    "num_train_epochs": 20,
    "fp16": True,
    "fp16_opt_level": "O2",
    "warmup_steps": 1000,
    "logging_steps": 0,
    "max_seq_length": 512,
    "multi_gpu": False
})

DATA_PATH = Path('../data/processed')
LOG_PATH = Path('../logs')
MODEL_PATH = Path('../models/lm_model_{}/'.format(args.model_type))

DATA_PATH.mkdir(exist_ok=True)
MODEL_PATH.mkdir(exist_ok=True)
LOG_PATH.mkdir(exist_ok=True)

In [35]:
databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=df['concat'],
    tokenizer=args.model_name,
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    model_type=args.model_type,
    logger=logger)

UnicodeEncodeError: 'utf-8' codec can't encode characters in position 162-163: surrogates not allowed