In [2]:
import pickle5 as pickle
import numpy as np
from IPython.display import display, Markdown
import pandas as pd
import tldextract

In [3]:
def get_ratio(cluster, n, exp):
 records = exp[exp['orcid'].isin(cluster[n])]
 grouped = records.groupby('url_domains').count().sort_values('orcid', ascending=False)
 last_percentile = grouped['orcid'].quantile(0.99)
 
 last_grouped = grouped[grouped['orcid'] >= last_percentile]
 ratio = last_grouped['orcid'].mean()/ len(records)
 return ratio

In [4]:
def output_cluster(cluster, n, exp, excluded_domains):
 
 records = exp[exp['orcid'].isin(cluster[n])]
 
 grouped = records.groupby('url_domains').count().sort_values('orcid', ascending=False)
 # grouped = records[~records['url_domains'].str.contains('|'.join(legit_domains))].groupby('url_domains').count().sort_values('orcid', ascending=False)
 grouped['norm'] = (1+grouped['orcid'])/2 # (-1,1] -> (0,1]
 grouped['lognorm'] = np.log(grouped['norm'])
 
 half_percentile = grouped['orcid'].quantile(0.5)
 last_percentile = grouped['orcid'].quantile(0.99)
 
 last_grouped = grouped[grouped['orcid'] >= last_percentile]
 others_grouped = grouped[grouped['orcid'] < last_percentile]
 half_grouped = grouped[grouped['orcid'] <= half_percentile]
 
 
 ratio = last_grouped['orcid'].mean()/ len(records)
 if ratio < 0.85:
 legit = True
 else:
 legit = False
 
 if legit:
 text = 'Legit'
 else:
 text = 'Illegit'
 display(Markdown('### Cluster ' + str(n) + ' - ' + text))
 display(Markdown('Cluster cardinality: ' + str(len(cluster[n]))))
 display(Markdown('------------'))
 display(Markdown('#### Domains summary'))
 print(grouped[['orcid']])
 
 display(Markdown('------------'))
 display(Markdown('#### Last percentile stats (freq >=' + str(last_percentile) + ')'))
 display(Markdown('Ratio ' + str(ratio)))
 
# display(Markdown('------------'))
# display(Markdown('#### Global stats'))
# display(Markdown('Skewness ' + str(grouped['lognorm'].skew())))
# display(Markdown('Kurtosis ' + str(grouped['lognorm'].kurt())))
# display(Markdown('Variance ' + str(grouped['lognorm'].var())))
# display(Markdown('Standard Deviation ' + str(grouped['lognorm'].std())))
# display(Markdown('------------'))
 
 random_examples = records.sample(10)
 display(Markdown('#### 10 random examples'))
 for v in random_examples['orcid']:
 print('https://orcid.org/' + v)

In [5]:
def extract_domain(link):
 return tldextract.extract(link).registered_domain
grid_df = pd.read_csv('data/grid/links.csv', index_col='grid_id')
grid_df['domain'] = grid_df.link.apply(extract_domain)
legit_domains = ['instagram.com', 'linkedin.com', 'twitter.com' 'youtube.com', 'researchgate.net', 'academia.edu', 'publons.com', 'github.io', 'github.com', 'scopus.com', 'researcherid.com', 'google', 'vub.be', '.ac.uk', 'goo.gl', 'elsevier.com', 'ssrn.com', 'elibrary.ru', 'google.co', '.gov', 'researchmap.jp', 'ucviden.dk']
excluded_domains = list(grid_df['domain'].values) + legit_domains

In [6]:
clusters = pickle.load(open('data/clusters.obj', 'rb'))
exp = pickle.load(open('data/exploded.obj', 'rb'))

In [7]:
output_cluster(clusters, 0, exp, excluded_domains)

### Cluster 0 - Legit

Cluster cardinality: 8364

------------

#### Domains summary

 orcid
url_domains 
researchgate.net 3409
google.com 2113
linkedin.com 1850
publons.com 1491
academia.edu 1371
... ...
ipsa.org 1
iranpub.com 1
ircm.fr 1
ird.fr 1
zvdd.de 1

[4004 rows x 1 columns]


------------

#### Last percentile stats (freq >=105.9399999999996)

Ratio 0.014687716828684146

#### 10 random examples

https://orcid.org/0000-0002-7781-7161
https://orcid.org/0000-0001-5646-1007
https://orcid.org/0000-0002-8242-2295
https://orcid.org/0000-0003-2450-090X
https://orcid.org/0000-0003-0638-8555
https://orcid.org/0000-0003-1785-9201
https://orcid.org/0000-0001-6224-1430
https://orcid.org/0000-0001-6451-7584
https://orcid.org/0000-0002-2884-2504
https://orcid.org/0000-0001-8253-9461


In [27]:
output_cluster(clusters, 5, exp, excluded_domains)

### Cluster 5 - Legit

Cluster cardinality: 1147

------------

#### Domains summary

 orcid
url_domains 
bit.ly 839
tinyurl.com 181
researchgate.net 108
linkedin.com 81
seals.ac.za 77
... ...
handle.net 1
hamgardi.com 1
griffith.edu.au 1
gregnewkirk.com 1
liverpool.ac.uk 1

[532 rows x 1 columns]


------------

#### Last percentile stats (freq >=57.89999999999941)

Ratio 0.08075539568345323

#### 10 random examples

https://orcid.org/0000-0003-1300-8004
https://orcid.org/0000-0002-8269-4098
https://orcid.org/0000-0002-6689-4129
https://orcid.org/0000-0001-8304-8656
https://orcid.org/0000-0002-7273-8640
https://orcid.org/0000-0001-8046-1613
https://orcid.org/0000-0002-8116-9611
https://orcid.org/0000-0002-2728-7019
https://orcid.org/0000-0002-3318-3996
https://orcid.org/0000-0002-8493-0402


In [8]:
output_cluster(clusters, 10, exp, excluded_domains)

### Cluster 10 - Legit

Cluster cardinality: 768

------------

#### Domains summary

 orcid
url_domains 
10.10.70.176 448
tut.fi 182
vtt.fi 152
linkedin.com 35
researchgate.net 22
ohio-state.edu 22
tuni.fi 17
helsinki.fi 8
google.fi 6
wordpress.com 4
twitter.com 4
google.com 4
ramentor.com 3
vttresearch.com 2
hamk.fi 2
mendeley.com 2
github.com 2
google.com.ua 1
ufabc.edu.br 1
sustainablehousingdesign.com 1
trustnet.fi 1
blogspot.it 1
bio-complexity.com 1
astrocytenet.org 1
ucpori.fi 1
uni-stuttgart.de 1
sofiepelsmakers.com 1
upv.es 1
valentinalenarduzzi.it 1
arxiv.org 1
amichalas.com 1
w2e.fi 1
academia.edu 1
spinunit.eu 1
digitalhealthrevolution.fi 1
github.io 1
man.ac.uk 1
immersafe-itn.eu 1
ju.se 1
kannisto.org 1
ku.dk 1
full-parallax-imaging.eu 1
lobov.biz 1
aane.in 1
environmentaldesignpocketbook.com 1
modulight.com 1
nanocalibrate.eu 1
nordicsustainablearchitecture.com 1
ntnu.edu 1
etrovub.be 1
ponomarenko.info 1
yli-kaakinen.fi 1


------------

#### Last percentile stats (freq >=312.34000000000054)

Ratio 0.47157894736842104

#### 10 random examples

https://orcid.org/0000-0003-0493-7434
https://orcid.org/0000-0003-0062-8677
https://orcid.org/0000-0002-7097-1983
https://orcid.org/0000-0002-4587-6167
https://orcid.org/0000-0002-9059-8047
https://orcid.org/0000-0002-8158-556X
https://orcid.org/0000-0002-7574-4835
https://orcid.org/0000-0002-4958-8533
https://orcid.org/0000-0002-7953-1036
https://orcid.org/0000-0002-6508-065X


In [22]:
output_cluster(clusters, 12, exp, excluded_domains)

### Cluster 12 - Illegit

Cluster cardinality: 683

------------

#### Domains summary

 orcid
url_domains 
bravesites.com 628
4shared.com 57
itsmyurls.com 3
wordpress.com 3
blogspot.com 2
weebly.com 2
jimdo.com 2
jigsy.com 2
issuu.com 1
wixsite.com 1
wattpad.com 1
unblog.fr 1
tumblr.com 1
nesterheatingandair.com 1
beep.com 1
hubpages.com 1
behance.net 1
home.blog 1
ask.fm 1
exante-otzyvy.ru 1
etoysreview.com 1
doomby.com 1
disqus.com 1
digg.com 1
dailymotion.com 1
chatempleteas.com 1
google.com 1


------------

#### Last percentile stats (freq >=479.5399999999991)

Ratio 0.8746518105849582

#### 10 random examples

https://orcid.org/0000-0002-3418-013X
https://orcid.org/0000-0001-8597-176X
https://orcid.org/0000-0003-1977-6397
https://orcid.org/0000-0003-2942-4096
https://orcid.org/0000-0002-0561-4541
https://orcid.org/0000-0002-3891-5025
https://orcid.org/0000-0001-9898-4797
https://orcid.org/0000-0002-2805-556X
https://orcid.org/0000-0002-0591-8203
https://orcid.org/0000-0003-3544-5141


In [9]:
output_cluster(clusters, 15, exp, excluded_domains)

### Cluster 15 - Illegit

Cluster cardinality: 655

------------

#### Domains summary

 orcid
url_domains 
pbase.com 655


------------

#### Last percentile stats (freq >=655.0)

Ratio 1.0

#### 10 random examples

https://orcid.org/0000-0003-0526-0031
https://orcid.org/0000-0003-2459-0191
https://orcid.org/0000-0002-3503-1216
https://orcid.org/0000-0002-2448-3460
https://orcid.org/0000-0002-3749-9488
https://orcid.org/0000-0002-8148-1664
https://orcid.org/0000-0003-3531-2226
https://orcid.org/0000-0002-0740-8047
https://orcid.org/0000-0002-8080-8952
https://orcid.org/0000-0002-1595-5649


In [10]:
output_cluster(clusters, 25, exp, excluded_domains)

### Cluster 25 - Legit

Cluster cardinality: 602

------------

#### Domains summary

 orcid
url_domains 
elsevierpure.com 567
tue.nl 32
linkedin.com 30
sbg.ac.at 28
researchgate.net 20
... ...
hethongdien.info 1
hannahaugustin.at 1
growingupyolngu.com.au 1
graphicjustice.org 1
mathmods.eu 1

[129 rows x 1 columns]


------------

#### Last percentile stats (freq >=31.439999999999998)

Ratio 0.332039911308204

#### 10 random examples

https://orcid.org/0000-0002-1038-3530
https://orcid.org/0000-0002-8942-9510
https://orcid.org/0000-0002-2518-6847
https://orcid.org/0000-0002-6238-8042
https://orcid.org/0000-0002-4378-8059
https://orcid.org/0000-0001-9266-0126
https://orcid.org/0000-0002-2241-7953
https://orcid.org/0000-0002-8700-2767
https://orcid.org/0000-0002-1525-4615
https://orcid.org/0000-0002-5508-1130


In [11]:
output_cluster(clusters, 35, exp, excluded_domains)

### Cluster 35 - Illegit

Cluster cardinality: 548

------------

#### Domains summary

 orcid
url_domains 
lucialpiazzale.com 548


------------

#### Last percentile stats (freq >=548.0)

Ratio 1.0

#### 10 random examples

https://orcid.org/0000-0002-0302-8274
https://orcid.org/0000-0003-3462-9264
https://orcid.org/0000-0002-4155-518X
https://orcid.org/0000-0003-0801-5541
https://orcid.org/0000-0002-2022-5675
https://orcid.org/0000-0002-3068-5640
https://orcid.org/0000-0001-6164-449X
https://orcid.org/0000-0002-7691-3426
https://orcid.org/0000-0002-6459-0754
https://orcid.org/0000-0001-6723-0407


In [12]:
output_cluster(clusters, 70, exp, excluded_domains)

### Cluster 70 - Legit

Cluster cardinality: 252

------------

#### Domains summary

 orcid
url_domains 
blogspot.co.uk 117
nature.com 77
twitter.com 22
mypartnerforever.com 19
angelfire.com 17
... ...
google.com.co 1
guillermito2.net 1
hawaii.edu 1
heacademy.ac.uk 1
york.ac.uk 1

[183 rows x 1 columns]


------------

#### Last percentile stats (freq >=31.900000000000375)

Ratio 0.16275167785234898

#### 10 random examples

https://orcid.org/0000-0003-2448-9464
https://orcid.org/0000-0002-7154-7658
https://orcid.org/0000-0003-0984-9580
https://orcid.org/0000-0002-6792-7148
https://orcid.org/0000-0002-9845-6633
https://orcid.org/0000-0001-6845-7232
https://orcid.org/0000-0001-5400-2712
https://orcid.org/0000-0002-5765-9856
https://orcid.org/0000-0002-9394-5702
https://orcid.org/0000-0001-6995-0491


In [13]:
output_cluster(clusters, 250, exp, excluded_domains)

### Cluster 250 - Legit

Cluster cardinality: 44

------------

#### Domains summary

 orcid
url_domains 
theses.fr 31
archives-ouvertes.fr 9
academia.edu 7
emse.fr 7
linkedin.com 6
cairn.info 5
adum.fr 5
researchgate.net 3
hypotheses.org 2
twitter.com 2
mines-stetienne.fr 2
yannziegler.com 2
google.fr 2
casadevelazquez.org 2
romanistik.de 1
univ-evry.fr 1
u-paris.fr 1
u-bordeaux.fr 1
univ-montp3.fr 1
univ-reunion.fr 1
univ-valenciennes.fr 1
univ-lille.fr 1
cnrs.fr 1
revue-etr.org 1
publons.com 1
crec-paris3.fr 1
ksu.edu.sa 1
iptheologie.fr 1
inra.fr 1
iie.kz 1
google.com 1
google.co.in 1
gonthier-leguen.fr 1
mom.fr 1


------------

#### Last percentile stats (freq >=23.740000000000038)

Ratio 0.29523809523809524

#### 10 random examples

https://orcid.org/0000-0003-1402-4356
https://orcid.org/0000-0002-4162-4288
https://orcid.org/0000-0002-4162-4288
https://orcid.org/0000-0003-0174-4442
https://orcid.org/0000-0002-2959-3984
https://orcid.org/0000-0003-0750-9881
https://orcid.org/0000-0002-9057-4045
https://orcid.org/0000-0002-2471-7028
https://orcid.org/0000-0001-9785-6697
https://orcid.org/0000-0003-0174-4442


In [14]:
output_cluster(clusters, 315, exp, excluded_domains)

### Cluster 315 - Legit

Cluster cardinality: 37

------------

#### Domains summary

 orcid
url_domains 
um.edu.mo 37
academia.edu 2
google.com 2
google.pt 1
hongcaizhang.com 1
researchgate.net 1
xhsysu.edu.cn 1


------------

#### Last percentile stats (freq >=34.89999999999998)

Ratio 0.8222222222222222

#### 10 random examples

https://orcid.org/0000-0002-1201-9564
https://orcid.org/0000-0003-2762-7543
https://orcid.org/0000-0003-4801-6354
https://orcid.org/0000-0002-6300-1575
https://orcid.org/0000-0003-1723-1748
https://orcid.org/0000-0003-3659-1917
https://orcid.org/0000-0003-0832-0263
https://orcid.org/0000-0002-8294-6419
https://orcid.org/0000-0001-9403-7346
https://orcid.org/0000-0001-5449-063X


In [15]:
output_cluster(clusters, 400, exp, excluded_domains)

### Cluster 400 - Illegit

Cluster cardinality: 30

------------

#### Domains summary

 orcid
url_domains 
symulatorypc.pl 30


------------

#### Last percentile stats (freq >=30.0)

Ratio 1.0

#### 10 random examples

https://orcid.org/0000-0001-7264-4006
https://orcid.org/0000-0001-8048-0654
https://orcid.org/0000-0003-1505-2172
https://orcid.org/0000-0002-2800-9168
https://orcid.org/0000-0001-5894-8644
https://orcid.org/0000-0002-0454-8765
https://orcid.org/0000-0001-7836-227X
https://orcid.org/0000-0002-2492-5767
https://orcid.org/0000-0003-2223-0707
https://orcid.org/0000-0002-1379-3578


In [16]:
output_cluster(clusters, 500, exp, excluded_domains)

### Cluster 500 - Illegit

Cluster cardinality: 24

------------

#### Domains summary

 orcid
url_domains 
haodf.com 24
linkedin.com 1
researchgate.net 1


------------

#### Last percentile stats (freq >=23.54)

Ratio 0.9230769230769231

#### 10 random examples

https://orcid.org/0000-0003-3967-1269
https://orcid.org/0000-0002-0272-493X
https://orcid.org/0000-0001-8459-742X
https://orcid.org/0000-0003-1952-6734
https://orcid.org/0000-0002-6959-5351
https://orcid.org/0000-0002-1260-4261
https://orcid.org/0000-0002-0348-9732
https://orcid.org/0000-0002-7528-3274
https://orcid.org/0000-0001-7493-607X
https://orcid.org/0000-0002-7865-0359


In [17]:
output_cluster(clusters, 600, exp, excluded_domains)

### Cluster 600 - Legit

Cluster cardinality: 19

------------

#### Domains summary

 orcid
url_domains 
nki.nl 19
linkedin.com 4
acsitefactory.com 1
cancerresearchuk.org 1
clinicaltrials.gov 1
dcisprecision.org 1
google.co.uk 1
google.com 1
researchgate.net 1
tushartomar.com 1
twitter.com 1


------------

#### Last percentile stats (freq >=17.500000000000007)

Ratio 0.59375

#### 10 random examples

https://orcid.org/0000-0002-8940-2676
https://orcid.org/0000-0002-3344-7605
https://orcid.org/0000-0001-5265-3407
https://orcid.org/0000-0002-8940-2676
https://orcid.org/0000-0001-6514-4767
https://orcid.org/0000-0003-1743-6428
https://orcid.org/0000-0001-6180-0632
https://orcid.org/0000-0003-1743-6428
https://orcid.org/0000-0002-3344-7605
https://orcid.org/0000-0001-5989-289X


In [18]:
output_cluster(clusters, 900, exp, excluded_domains)

### Cluster 900 - Legit

Cluster cardinality: 13

------------

#### Domains summary

 orcid
url_domains 
fsb.hr 13
cam.ac.uk 1
researchgate.net 1
utwente.nl 1
wikki.co.uk 1


------------

#### Last percentile stats (freq >=12.52)

Ratio 0.7647058823529411

#### 10 random examples

https://orcid.org/0000-0001-7549-8972
https://orcid.org/0000-0003-2207-9162
https://orcid.org/0000-0001-7549-8972
https://orcid.org/0000-0002-6487-4749
https://orcid.org/0000-0003-2063-2294
https://orcid.org/0000-0002-8170-5787
https://orcid.org/0000-0001-7353-0537
https://orcid.org/0000-0002-8170-5787
https://orcid.org/0000-0002-8596-1972
https://orcid.org/0000-0003-4481-8288


In [19]:
output_cluster(clusters, 1000, exp, excluded_domains)

### Cluster 1000 - Illegit

Cluster cardinality: 12

------------

#### Domains summary

 orcid
url_domains 
vic-casino.com 12


------------

#### Last percentile stats (freq >=12.0)

Ratio 1.0

#### 10 random examples

https://orcid.org/0000-0003-1324-6811
https://orcid.org/0000-0003-4404-4542
https://orcid.org/0000-0003-4637-4668
https://orcid.org/0000-0002-0284-3453
https://orcid.org/0000-0002-7302-4623
https://orcid.org/0000-0002-0095-7211
https://orcid.org/0000-0002-5553-4048
https://orcid.org/0000-0002-9113-3372
https://orcid.org/0000-0003-2003-7217
https://orcid.org/0000-0001-5583-8415


In [20]:
total = 0
for i in range(0, len(clusters)):
 r = get_ratio(clusters, i, exp)
 if r > 0.85:
 total += len(clusters[i])
print(total)

64748


In [21]:
total_more_than_10 = 0
for i in range(0, len(clusters)):
 if len(clusters[i]) > 10:
 r = get_ratio(clusters, i, exp)
 if r > 0.85:
 total_more_than_10 += len(clusters[i])
print(total_more_than_10)

45653
