55 KiB
55 KiB
In [2]:
import pickle5 as pickle
import numpy as np
from IPython.display import display, Markdown
import pandas as pd
import tldextract
In [3]:
def get_ratio(cluster, n, exp):
    """Return the dominance ratio of the most frequent URL domain(s) in a cluster.

    The rows of ``exp`` whose 'orcid' value belongs to ``cluster[n]`` are
    selected and counted per 'url_domains' value. The counts lying at or above
    their 0.99 quantile are averaged, and that average is divided by the number
    of selected rows.

    Parameters
    ----------
    cluster : indexable
        ``cluster[n]`` is passed to ``Series.isin`` to select the rows.
    n : key or index into ``cluster``.
    exp : pandas.DataFrame
        Must expose 'orcid' and 'url_domains' columns.

    Returns
    -------
    The mean of the top domain counts divided by the number of selected rows.
    """
    in_cluster = exp['orcid'].isin(cluster[n])
    cluster_rows = exp[in_cluster]

    # Per-domain count of (non-null) 'orcid' values, most frequent domain first.
    per_domain = cluster_rows.groupby('url_domains').count()
    per_domain = per_domain.sort_values('orcid', ascending=False)
    domain_counts = per_domain['orcid']

    # Keep only the domains whose frequency reaches the 99th percentile.
    cutoff = domain_counts.quantile(0.99)
    top_counts = domain_counts[domain_counts >= cutoff]

    return top_counts.mean() / len(cluster_rows)
In [4]:
def output_cluster(cluster, n, exp, excluded_domains):
    """Display a Markdown report about cluster ``n`` in the notebook output.

    The report contains, in order: a Legit/Illegit headline, the cluster
    cardinality, the number of records per URL domain, the ratio computed on
    the domains at or above the 0.99 quantile of frequency, and up to ten
    randomly sampled ORCID profile URLs.

    Parameters
    ----------
    cluster : indexable
        ``cluster[n]`` is the collection of ORCID iDs forming the cluster.
    n : key or index into ``cluster``.
    exp : pandas.DataFrame
        Must expose 'orcid' and 'url_domains' columns.
    excluded_domains :
        Not used by the current implementation; kept so existing calls
        keep working.

    Returns
    -------
    None. Everything is emitted through ``display`` / ``print``.
    """
    ratio_threshold = 0.85  # ratios below this are labelled Legit
    top_quantile = 0.99     # "last percentile" of the domain frequencies
    max_examples = 10

    records = exp[exp['orcid'].isin(cluster[n])]

    # Without any matching record the ratio below would divide by zero, so
    # report the situation explicitly instead of mislabelling the cluster.
    if len(records) == 0:
        display(Markdown('### Cluster ' + str(n) + ' - no matching records'))
        display(Markdown('Cluster cardinality: ' + str(len(cluster[n]))))
        return

    grouped = records.groupby('url_domains').count().sort_values('orcid', ascending=False)

    last_percentile = grouped['orcid'].quantile(top_quantile)
    last_grouped = grouped[grouped['orcid'] >= last_percentile]
    ratio = last_grouped['orcid'].mean() / len(records)

    if ratio < ratio_threshold:
        text = '<span style="color:green">Legit</span>'
    else:
        text = '<span style="color:red">Illegit</span>'

    display(Markdown('### Cluster ' + str(n) + ' - ' + text))
    display(Markdown('Cluster cardinality: ' + str(len(cluster[n]))))
    display(Markdown('------------'))
    display(Markdown('#### Domains summary'))
    display(grouped[['orcid']])
    display(Markdown('------------'))
    display(Markdown('#### Last percentile stats (freq >=' + str(last_percentile) + ')'))
    display(Markdown('Ratio ' + str(ratio)))

    # DataFrame.sample raises when asked for more rows than are available,
    # so small clusters are capped at their own size.
    n_examples = min(max_examples, len(records))
    random_examples = records.sample(n_examples)
    display(Markdown('#### ' + str(n_examples) + ' random examples'))
    for v in random_examples['orcid']:
        print('https://orcid.org/' + v)
In [5]:
def extract_domain(link):
    """Return the ``registered_domain`` that ``tldextract`` reports for ``link``."""
    parts = tldextract.extract(link)
    return parts.registered_domain
# Load the links table (indexed by grid_id) and derive a domain for every link.
grid_df = pd.read_csv('data/grid/links.csv', index_col='grid_id')
grid_df['domain'] = grid_df.link.apply(extract_domain)

# Domains (and domain fragments such as '.gov') considered legitimate.
# Every entry needs its own comma: adjacent string literals are silently
# concatenated, which previously merged 'twitter.com' and 'youtube.com'
# into the single bogus entry 'twitter.comyoutube.com'.
legit_domains = [
    'instagram.com',
    'linkedin.com',
    'twitter.com',
    'youtube.com',
    'researchgate.net',
    'academia.edu',
    'publons.com',
    'github.io',
    'github.com',
    'scopus.com',
    'researcherid.com',
    'google',
    'vub.be',
    '.ac.uk',
    'goo.gl',
    'elsevier.com',
    'ssrn.com',
    'elibrary.ru',
    'google.co',
    '.gov',
    'researchmap.jp',
    'ucviden.dk',
]

# Domains found in the links table first, followed by the hand-picked list.
excluded_domains = list(grid_df['domain'].values) + legit_domains
In [6]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context managers make sure both file handles are closed after loading.
with open('data/clusters.obj', 'rb') as clusters_file:
    clusters = pickle.load(clusters_file)
with open('data/exploded.obj', 'rb') as exploded_file:
    exp = pickle.load(exploded_file)
In [7]:
# Spot-check a hand-picked sample of clusters. The ids and their order are the
# same as in the former one-call-per-cell version; a single loop replaces the
# fifteen copy-pasted cells.
sample_cluster_ids = [0, 5, 10, 12, 15, 25, 35, 70, 250, 315, 400, 500, 600, 900, 1000]
for cluster_id in sample_cluster_ids:
    output_cluster(clusters, cluster_id, exp, excluded_domains)
In [20]:
# Number of ORCID iDs that belong to clusters flagged as illegitimate.
# output_cluster labels a cluster Legit only when ratio < 0.85, so a ratio of
# exactly 0.85 counts as Illegit; use >= here so both agree on the boundary.
total = sum(
    len(clusters[i])
    for i in range(len(clusters))
    if get_ratio(clusters, i, exp) >= 0.85
)
print(total)
In [21]:
# Same count as above, restricted to clusters with more than 10 members.
# The size check comes first so get_ratio is only evaluated for clusters that
# are large enough. output_cluster labels a cluster Legit only when
# ratio < 0.85, hence >= (not >) to agree with it on the boundary value.
total_more_than_10 = sum(
    len(clusters[i])
    for i in range(len(clusters))
    if len(clusters[i]) > 10 and get_ratio(clusters, i, exp) >= 0.85
)
print(total_more_than_10)