51 KiB
51 KiB
Dataset preprocessing¶
In [49]:
import pandas as pd
import ast
import tldextract
import numpy
In [50]:
# Notable Solid ORCID iDs for debug purposes
# Used below to spot-check the processed frame on known-legitimate profiles.
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
In [51]:
# Notable fake ORCID iDs for debug purposes
# Used below to spot-check the processed frame on known-spam profiles.
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
In [52]:
# Explicit dtypes for the raw ORCID profile dump; nullable pandas dtypes are
# used so missing values survive the load.
RAW_DTYPES = {
    'orcid': pd.StringDtype(),
    'claimed': bool,
    'verifyed email': bool,
    'verified primary email': bool,
    'given names': pd.StringDtype(),
    'family name': pd.StringDtype(),
    'biography': pd.StringDtype(),
    'other names': pd.StringDtype(),
    'researcher urls': pd.StringDtype(),
    'primary email': pd.StringDtype(),
    'other emails': pd.StringDtype(),
    'keywords': pd.StringDtype(),
    'external identifiers': pd.StringDtype(),
    'education': pd.StringDtype(),
    'employments': pd.StringDtype(),
    'number of works': pd.Int16Dtype(),
    'works source': pd.StringDtype(),
}

# Load the raw, tab-separated profile dump.
df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header=0,
                 dtype=RAW_DTYPES)
In [53]:
# Free-text columns: represent missing values as the empty string so the
# later concatenation step never hits <NA>.
for _col in ('given names', 'family name', 'biography', 'primary email'):
    df[_col] = df[_col].fillna('')
In [54]:
# 'other names' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['other names'] = df['other names'].fillna('[]').map(ast.literal_eval)
In [55]:
# 'keywords' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['keywords'] = df['keywords'].fillna('[]').map(ast.literal_eval)
def extract_url_domains(lst):
    """Map parsed researcher-url entries to their registered domains.

    Each entry is a (description, url) pair:
      e[0] is a string describing the url
      e[1] is the url itself
    Only the url is used; tldextract reduces it to its registered domain.
    """
    domains = []
    for e in lst:
        domains.append(tldextract.extract(e[1]).registered_domain)
    return domains

# Parse the serialized list of (description, url) pairs first...
df['researcher urls'] = df['researcher urls'].fillna('[]').apply(ast.literal_eval)
# ...then reduce each entry to its registered domain.
# NOTE: in the original notebook these cells were executed out of order
# (In[59] before In[57]/In[58]); on a fresh Restart-&-Run-All the extraction
# ran before the function existed and before the column was parsed.
df['url_domains'] = df['researcher urls'].apply(extract_url_domains)
In [60]:
# 'education' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['education'] = df['education'].fillna('[]').map(ast.literal_eval)
In [61]:
def extract_education(lst):
    """Collapse each parsed education entry into a 'degree role university' string.

    Each entry is a sequence laid out as:
      e[0] degree, e[1] role, e[2] university,
      followed by city, region, country, id, id_scheme (ignored here).
    """
    return [' '.join(entry[:3]) for entry in lst]
In [62]:
# 'employments' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['employments'] = df['employments'].fillna('[]').map(ast.literal_eval)
In [63]:
def extract_employment(lst):
    """Collapse each parsed employment entry into a 'role institute' string.

    Each entry is a sequence laid out as:
      e[0] role, e[1] institute,
      followed by city, region, country, id, id_scheme (ignored here).
    """
    return [' '.join(entry[:2]) for entry in lst]
In [56]:
# 'other emails' is serialized as a Python list literal; parse it, treating
# missing values as the empty list.
df['other emails'] = df['other emails'].fillna('[]').map(ast.literal_eval)
In [64]:
def extract_email_domains(lst):
    """Split each address at '@' by replacing it with a space.

    E.g. 'alice@example.org' -> 'alice example.org', so the local part and
    the domain become separate tokens for downstream text processing.
    """
    return [address.replace('@', ' ') for address in lst]
In [81]:
# Concatenate all textual profile fields into one newline-separated document
# per record; this is the text later fed to the vectorizer / language model.
# BUG FIX: the original used df['primary email'].values[0], which pasted the
# FIRST row's primary email into every record. Use a per-row replace instead.
df['concat'] = df['given names'] + ' ' + df['family name'] + '\n' + \
    df['other names'].apply(' '.join) + '\n' + \
    df['primary email'].str.replace('@', ' ', regex=False) + '\n' + \
    df['other emails'].apply(lambda x: ' '.join(extract_email_domains(x))) + '\n' + \
    df['biography'] + '\n' + \
    df['keywords'].apply(' - '.join) + '\n' + \
    df['url_domains'].apply(' '.join) + '\n' + \
    df['education'].apply(lambda x: '\n'.join(extract_education(x))) + '\n' + \
    df['employments'].apply(lambda x: '\n'.join(extract_employment(x)))
In [82]:
# Spot-check the concatenated document for a known-legitimate profile.
print(df.loc[df['orcid'] == AM, 'concat'].values[0])
In [19]:
# Label profiles with at least one work as positive examples.
# FIX: the original only set True for matching rows and left every other row's
# label as NaN; make the label an explicit boolean for all rows so later
# filters and splits see False (not missing) for zero-work profiles.
df['label'] = (df['number of works'] > 0).fillna(False)
In [20]:
# Sanity check: inspect a known-legitimate profile.
df[df['orcid'] == AM]
Out[20]:
In [21]:
# Sanity check: inspect a known-fake profile.
df[df['orcid'] == SCAFFOLD]
Out[21]:
In [25]:
# Keep only the columns needed downstream: the id, the concatenated text,
# and the label.
df = df[['orcid', 'concat', 'label']]
df
Out[25]:
Pre-trained spam filter as-is¶
In [83]:
import string
import torch
import transformers
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
One-Class SVM¶
In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
In [44]:
# Train a One-Class SVM using only the verified (label == True) profiles as
# inliers.
samples = df[df['label'] == True].copy()  # .copy(): independent frame, avoids SettingWithCopyWarning

vectorizer = TfidfVectorizer()
# FIX: keep the TF-IDF result as a standalone sparse matrix — a scipy.sparse
# matrix cannot be stored as a single DataFrame column, which is what the
# original `samples['features'] = ...` attempted.
features = vectorizer.fit_transform(samples['concat'])

# Hold out 70% of the inliers for evaluation.
# FIX: the original passed `stratify=y` with `y` undefined (NameError); all
# labels here are True anyway, so stratification is meaningless and dropped.
trainX, testX, trainy, testy = train_test_split(
    features, samples['label'], test_size=0.7, random_state=3)

# nu ~ expected fraction of outliers tolerated among the training inliers.
model = OneClassSVM(gamma='scale', nu=0.01)
model.fit(trainX)
Training¶
In [118]:
# Re-fit the one-class model on ALL verified profiles (no hold-out).
# FIX: the original fit on the raw DataFrame (string columns), which raises a
# ValueError — the text must go through the fitted TF-IDF vectorizer first.
model.fit(vectorizer.transform(df.loc[df['label'] == True, 'concat']))
Evaluation¶
BERT¶
In [30]:
# Language model Databunch
from fast_bert.data_lm import BertLMDataBunch
# Language model learner
from fast_bert.learner_lm import BertLMLearner
from pathlib import Path
from box import Box
import logging
logger = logging.getLogger()  # root logger, passed to the fast_bert databunch below
In [34]:
# Box is a nice wrapper to create an object from a json dict
args = Box({
"seed": 42,
"task_name": 'imdb_reviews_lm',
"model_name": 'roberta-base',
"model_type": 'roberta',
"train_batch_size": 16,
"learning_rate": 4e-5,
"num_train_epochs": 20,
"fp16": True,
"fp16_opt_level": "O2",
"warmup_steps": 1000,
"logging_steps": 0,
"max_seq_length": 512,
"multi_gpu": False
})
DATA_PATH = Path('../data/processed')
LOG_PATH = Path('../logs')
MODEL_PATH = Path('../models/lm_model_{}/'.format(args.model_type))
DATA_PATH.mkdir(exist_ok=True)
MODEL_PATH.mkdir(exist_ok=True)
LOG_PATH.mkdir(exist_ok=True)
In [35]:
# Build the language-model databunch directly from the concatenated profile
# texts, so the LM can be fine-tuned on this corpus before classification.
databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=df['concat'],
    tokenizer=args.model_name,       # tokenizer resolved from the model name
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    model_type=args.model_type,
    logger=logger)