{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pickle5 as pickle\n", "import numpy as np\n", "from IPython.display import display, Markdown\n", "import pandas as pd\n", "import tldextract" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_ratio(cluster, n, exp):\n", " records = exp[exp['orcid'].isin(cluster[n])]\n", " grouped = records.groupby('url_domains').count().sort_values('orcid', ascending=False)\n", " last_percentile = grouped['orcid'].quantile(0.99)\n", " \n", " last_grouped = grouped[grouped['orcid'] >= last_percentile]\n", " ratio = last_grouped['orcid'].mean()/ len(records)\n", " return ratio" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def output_cluster(cluster, n, exp, excluded_domains):\n", " \n", " records = exp[exp['orcid'].isin(cluster[n])]\n", " \n", " grouped = records.groupby('url_domains').count().sort_values('orcid', ascending=False)\n", " # grouped = records[~records['url_domains'].str.contains('|'.join(legit_domains))].groupby('url_domains').count().sort_values('orcid', ascending=False)\n", " grouped['norm'] = (1+grouped['orcid'])/2 # (-1,1] -> (0,1]\n", " grouped['lognorm'] = np.log(grouped['norm'])\n", " \n", " half_percentile = grouped['orcid'].quantile(0.5)\n", " last_percentile = grouped['orcid'].quantile(0.99)\n", " \n", " last_grouped = grouped[grouped['orcid'] >= last_percentile]\n", " others_grouped = grouped[grouped['orcid'] < last_percentile]\n", " half_grouped = grouped[grouped['orcid'] <= half_percentile]\n", " \n", " \n", " ratio = last_grouped['orcid'].mean()/ len(records)\n", " if ratio < 0.85:\n", " legit = True\n", " else:\n", " legit = False\n", " \n", " if legit:\n", " text = 'Legit'\n", " else:\n", " text = 'Illegit'\n", " display(Markdown('### Cluster ' + str(n) + ' - ' + text))\n", " display(Markdown('Cluster cardinality: ' + str(len(cluster[n]))))\n", " display(Markdown('------------'))\n", " display(Markdown('#### Domains summary'))\n", " print(grouped[['orcid']])\n", " \n", " display(Markdown('------------'))\n", " display(Markdown('#### Last percentile stats (freq >=' + str(last_percentile) + ')'))\n", " display(Markdown('Ratio ' + str(ratio)))\n", " \n", "# display(Markdown('------------'))\n", "# display(Markdown('#### Global stats'))\n", "# display(Markdown('Skewness ' + str(grouped['lognorm'].skew())))\n", "# display(Markdown('Kurtosis ' + str(grouped['lognorm'].kurt())))\n", "# display(Markdown('Variance ' + str(grouped['lognorm'].var())))\n", "# display(Markdown('Standard Deviation ' + str(grouped['lognorm'].std())))\n", "# display(Markdown('------------'))\n", " \n", " random_examples = records.sample(10)\n", " display(Markdown('#### 10 random examples'))\n", " for v in random_examples['orcid']:\n", " print('https://orcid.org/' + v)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def extract_domain(link):\n", " return tldextract.extract(link).registered_domain\n", "grid_df = pd.read_csv('data/grid/links.csv', index_col='grid_id')\n", "grid_df['domain'] = grid_df.link.apply(extract_domain)\n", "legit_domains = ['instagram.com', 'linkedin.com', 'twitter.com' 'youtube.com', 'researchgate.net', 'academia.edu', 'publons.com', 'github.io', 'github.com', 'scopus.com', 'researcherid.com', 'google', 'vub.be', '.ac.uk', 'goo.gl', 'elsevier.com', 'ssrn.com', 'elibrary.ru', 'google.co', '.gov', 'researchmap.jp', 'ucviden.dk']\n", "excluded_domains = list(grid_df['domain'].values) + legit_domains" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "clusters = pickle.load(open('data/clusters.obj', 'rb'))\n", "exp = pickle.load(open('data/exploded.obj', 'rb'))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 0 - Legit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 8364" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "researchgate.net 3409\n", "google.com 2113\n", "linkedin.com 1850\n", "publons.com 1491\n", "academia.edu 1371\n", "... ...\n", "ipsa.org 1\n", "iranpub.com 1\n", "ircm.fr 1\n", "ird.fr 1\n", "zvdd.de 1\n", "\n", "[4004 rows x 1 columns]\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=105.9399999999996)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.014687716828684146" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0002-7781-7161\n", "https://orcid.org/0000-0001-5646-1007\n", "https://orcid.org/0000-0002-8242-2295\n", "https://orcid.org/0000-0003-2450-090X\n", "https://orcid.org/0000-0003-0638-8555\n", "https://orcid.org/0000-0003-1785-9201\n", "https://orcid.org/0000-0001-6224-1430\n", "https://orcid.org/0000-0001-6451-7584\n", "https://orcid.org/0000-0002-2884-2504\n", "https://orcid.org/0000-0001-8253-9461\n" ] } ], "source": [ "output_cluster(clusters, 0, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 5 - Legit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 1147" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "bit.ly 839\n", "tinyurl.com 181\n", "researchgate.net 108\n", "linkedin.com 81\n", "seals.ac.za 77\n", "... ...\n", "handle.net 1\n", "hamgardi.com 1\n", "griffith.edu.au 1\n", "gregnewkirk.com 1\n", "liverpool.ac.uk 1\n", "\n", "[532 rows x 1 columns]\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=57.89999999999941)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.08075539568345323" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0003-1300-8004\n", "https://orcid.org/0000-0002-8269-4098\n", "https://orcid.org/0000-0002-6689-4129\n", "https://orcid.org/0000-0001-8304-8656\n", "https://orcid.org/0000-0002-7273-8640\n", "https://orcid.org/0000-0001-8046-1613\n", "https://orcid.org/0000-0002-8116-9611\n", "https://orcid.org/0000-0002-2728-7019\n", "https://orcid.org/0000-0002-3318-3996\n", "https://orcid.org/0000-0002-8493-0402\n" ] } ], "source": [ "output_cluster(clusters, 5, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 10 - Legit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 768" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "10.10.70.176 448\n", "tut.fi 182\n", "vtt.fi 152\n", "linkedin.com 35\n", "researchgate.net 22\n", "ohio-state.edu 22\n", "tuni.fi 17\n", "helsinki.fi 8\n", "google.fi 6\n", "wordpress.com 4\n", "twitter.com 4\n", "google.com 4\n", "ramentor.com 3\n", "vttresearch.com 2\n", "hamk.fi 2\n", "mendeley.com 2\n", "github.com 2\n", "google.com.ua 1\n", "ufabc.edu.br 1\n", "sustainablehousingdesign.com 1\n", "trustnet.fi 1\n", "blogspot.it 1\n", "bio-complexity.com 1\n", "astrocytenet.org 1\n", "ucpori.fi 1\n", "uni-stuttgart.de 1\n", "sofiepelsmakers.com 1\n", "upv.es 1\n", "valentinalenarduzzi.it 1\n", "arxiv.org 1\n", "amichalas.com 1\n", "w2e.fi 1\n", "academia.edu 1\n", "spinunit.eu 1\n", "digitalhealthrevolution.fi 1\n", "github.io 1\n", "man.ac.uk 1\n", "immersafe-itn.eu 1\n", "ju.se 1\n", "kannisto.org 1\n", "ku.dk 1\n", "full-parallax-imaging.eu 1\n", "lobov.biz 1\n", "aane.in 1\n", "environmentaldesignpocketbook.com 1\n", "modulight.com 1\n", "nanocalibrate.eu 1\n", "nordicsustainablearchitecture.com 1\n", "ntnu.edu 1\n", "etrovub.be 1\n", "ponomarenko.info 1\n", "yli-kaakinen.fi 1\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=312.34000000000054)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.47157894736842104" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0003-0493-7434\n", "https://orcid.org/0000-0003-0062-8677\n", "https://orcid.org/0000-0002-7097-1983\n", "https://orcid.org/0000-0002-4587-6167\n", "https://orcid.org/0000-0002-9059-8047\n", "https://orcid.org/0000-0002-8158-556X\n", "https://orcid.org/0000-0002-7574-4835\n", "https://orcid.org/0000-0002-4958-8533\n", "https://orcid.org/0000-0002-7953-1036\n", "https://orcid.org/0000-0002-6508-065X\n" ] } ], "source": [ "output_cluster(clusters, 10, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 12 - Illegit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 683" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "bravesites.com 628\n", "4shared.com 57\n", "itsmyurls.com 3\n", "wordpress.com 3\n", "blogspot.com 2\n", "weebly.com 2\n", "jimdo.com 2\n", "jigsy.com 2\n", "issuu.com 1\n", "wixsite.com 1\n", "wattpad.com 1\n", "unblog.fr 1\n", "tumblr.com 1\n", "nesterheatingandair.com 1\n", "beep.com 1\n", "hubpages.com 1\n", "behance.net 1\n", "home.blog 1\n", "ask.fm 1\n", "exante-otzyvy.ru 1\n", "etoysreview.com 1\n", "doomby.com 1\n", "disqus.com 1\n", "digg.com 1\n", "dailymotion.com 1\n", "chatempleteas.com 1\n", "google.com 1\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=479.5399999999991)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.8746518105849582" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0002-3418-013X\n", "https://orcid.org/0000-0001-8597-176X\n", "https://orcid.org/0000-0003-1977-6397\n", "https://orcid.org/0000-0003-2942-4096\n", "https://orcid.org/0000-0002-0561-4541\n", "https://orcid.org/0000-0002-3891-5025\n", "https://orcid.org/0000-0001-9898-4797\n", "https://orcid.org/0000-0002-2805-556X\n", "https://orcid.org/0000-0002-0591-8203\n", "https://orcid.org/0000-0003-3544-5141\n" ] } ], "source": [ "output_cluster(clusters, 12, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 15 - Illegit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 655" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "pbase.com 655\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=655.0)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 1.0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0003-0526-0031\n", "https://orcid.org/0000-0003-2459-0191\n", "https://orcid.org/0000-0002-3503-1216\n", "https://orcid.org/0000-0002-2448-3460\n", "https://orcid.org/0000-0002-3749-9488\n", "https://orcid.org/0000-0002-8148-1664\n", "https://orcid.org/0000-0003-3531-2226\n", "https://orcid.org/0000-0002-0740-8047\n", "https://orcid.org/0000-0002-8080-8952\n", "https://orcid.org/0000-0002-1595-5649\n" ] } ], "source": [ "output_cluster(clusters, 15, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 25 - Legit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 602" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "elsevierpure.com 567\n", "tue.nl 32\n", "linkedin.com 30\n", "sbg.ac.at 28\n", "researchgate.net 20\n", "... ...\n", "hethongdien.info 1\n", "hannahaugustin.at 1\n", "growingupyolngu.com.au 1\n", "graphicjustice.org 1\n", "mathmods.eu 1\n", "\n", "[129 rows x 1 columns]\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=31.439999999999998)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.332039911308204" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0002-1038-3530\n", "https://orcid.org/0000-0002-8942-9510\n", "https://orcid.org/0000-0002-2518-6847\n", "https://orcid.org/0000-0002-6238-8042\n", "https://orcid.org/0000-0002-4378-8059\n", "https://orcid.org/0000-0001-9266-0126\n", "https://orcid.org/0000-0002-2241-7953\n", "https://orcid.org/0000-0002-8700-2767\n", "https://orcid.org/0000-0002-1525-4615\n", "https://orcid.org/0000-0002-5508-1130\n" ] } ], "source": [ "output_cluster(clusters, 25, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 35 - Illegit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 548" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "lucialpiazzale.com 548\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=548.0)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 1.0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0002-0302-8274\n", "https://orcid.org/0000-0003-3462-9264\n", "https://orcid.org/0000-0002-4155-518X\n", "https://orcid.org/0000-0003-0801-5541\n", "https://orcid.org/0000-0002-2022-5675\n", "https://orcid.org/0000-0002-3068-5640\n", "https://orcid.org/0000-0001-6164-449X\n", "https://orcid.org/0000-0002-7691-3426\n", "https://orcid.org/0000-0002-6459-0754\n", "https://orcid.org/0000-0001-6723-0407\n" ] } ], "source": [ "output_cluster(clusters, 35, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/markdown": [ "### Cluster 70 - Legit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 252" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "blogspot.co.uk 117\n", "nature.com 77\n", "twitter.com 22\n", "mypartnerforever.com 19\n", "angelfire.com 17\n", "... ...\n", "google.com.co 1\n", "guillermito2.net 1\n", "hawaii.edu 1\n", "heacademy.ac.uk 1\n", "york.ac.uk 1\n", "\n", "[183 rows x 1 columns]\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=31.900000000000375)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.16275167785234898" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0003-2448-9464\n", "https://orcid.org/0000-0002-7154-7658\n", "https://orcid.org/0000-0003-0984-9580\n", "https://orcid.org/0000-0002-6792-7148\n", "https://orcid.org/0000-0002-9845-6633\n", "https://orcid.org/0000-0001-6845-7232\n", "https://orcid.org/0000-0001-5400-2712\n", "https://orcid.org/0000-0002-5765-9856\n", "https://orcid.org/0000-0002-9394-5702\n", "https://orcid.org/0000-0001-6995-0491\n" ] } ], "source": [ "output_cluster(clusters, 70, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 250 - Legit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 44" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "theses.fr 31\n", "archives-ouvertes.fr 9\n", "academia.edu 7\n", "emse.fr 7\n", "linkedin.com 6\n", "cairn.info 5\n", "adum.fr 5\n", "researchgate.net 3\n", "hypotheses.org 2\n", "twitter.com 2\n", "mines-stetienne.fr 2\n", "yannziegler.com 2\n", "google.fr 2\n", "casadevelazquez.org 2\n", "romanistik.de 1\n", "univ-evry.fr 1\n", "u-paris.fr 1\n", "u-bordeaux.fr 1\n", "univ-montp3.fr 1\n", "univ-reunion.fr 1\n", "univ-valenciennes.fr 1\n", "univ-lille.fr 1\n", "cnrs.fr 1\n", "revue-etr.org 1\n", "publons.com 1\n", "crec-paris3.fr 1\n", "ksu.edu.sa 1\n", "iptheologie.fr 1\n", "inra.fr 1\n", "iie.kz 1\n", "google.com 1\n", "google.co.in 1\n", "gonthier-leguen.fr 1\n", "mom.fr 1\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=23.740000000000038)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.29523809523809524" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0003-1402-4356\n", "https://orcid.org/0000-0002-4162-4288\n", "https://orcid.org/0000-0002-4162-4288\n", "https://orcid.org/0000-0003-0174-4442\n", "https://orcid.org/0000-0002-2959-3984\n", "https://orcid.org/0000-0003-0750-9881\n", "https://orcid.org/0000-0002-9057-4045\n", "https://orcid.org/0000-0002-2471-7028\n", "https://orcid.org/0000-0001-9785-6697\n", "https://orcid.org/0000-0003-0174-4442\n" ] } ], "source": [ "output_cluster(clusters, 250, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 315 - Legit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 37" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "um.edu.mo 37\n", "academia.edu 2\n", "google.com 2\n", "google.pt 1\n", "hongcaizhang.com 1\n", "researchgate.net 1\n", "xhsysu.edu.cn 1\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=34.89999999999998)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.8222222222222222" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0002-1201-9564\n", "https://orcid.org/0000-0003-2762-7543\n", "https://orcid.org/0000-0003-4801-6354\n", "https://orcid.org/0000-0002-6300-1575\n", "https://orcid.org/0000-0003-1723-1748\n", "https://orcid.org/0000-0003-3659-1917\n", "https://orcid.org/0000-0003-0832-0263\n", "https://orcid.org/0000-0002-8294-6419\n", "https://orcid.org/0000-0001-9403-7346\n", "https://orcid.org/0000-0001-5449-063X\n" ] } ], "source": [ "output_cluster(clusters, 315, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 400 - Illegit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 30" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "symulatorypc.pl 30\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=30.0)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 1.0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0001-7264-4006\n", "https://orcid.org/0000-0001-8048-0654\n", "https://orcid.org/0000-0003-1505-2172\n", "https://orcid.org/0000-0002-2800-9168\n", "https://orcid.org/0000-0001-5894-8644\n", "https://orcid.org/0000-0002-0454-8765\n", "https://orcid.org/0000-0001-7836-227X\n", "https://orcid.org/0000-0002-2492-5767\n", "https://orcid.org/0000-0003-2223-0707\n", "https://orcid.org/0000-0002-1379-3578\n" ] } ], "source": [ "output_cluster(clusters, 400, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 500 - Illegit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 24" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "haodf.com 24\n", "linkedin.com 1\n", "researchgate.net 1\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=23.54)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.9230769230769231" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0003-3967-1269\n", "https://orcid.org/0000-0002-0272-493X\n", "https://orcid.org/0000-0001-8459-742X\n", "https://orcid.org/0000-0003-1952-6734\n", "https://orcid.org/0000-0002-6959-5351\n", "https://orcid.org/0000-0002-1260-4261\n", "https://orcid.org/0000-0002-0348-9732\n", "https://orcid.org/0000-0002-7528-3274\n", "https://orcid.org/0000-0001-7493-607X\n", "https://orcid.org/0000-0002-7865-0359\n" ] } ], "source": [ "output_cluster(clusters, 500, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 600 - Legit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 19" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "nki.nl 19\n", "linkedin.com 4\n", "acsitefactory.com 1\n", "cancerresearchuk.org 1\n", "clinicaltrials.gov 1\n", "dcisprecision.org 1\n", "google.co.uk 1\n", "google.com 1\n", "researchgate.net 1\n", "tushartomar.com 1\n", "twitter.com 1\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=17.500000000000007)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.59375" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0002-8940-2676\n", "https://orcid.org/0000-0002-3344-7605\n", "https://orcid.org/0000-0001-5265-3407\n", "https://orcid.org/0000-0002-8940-2676\n", "https://orcid.org/0000-0001-6514-4767\n", "https://orcid.org/0000-0003-1743-6428\n", "https://orcid.org/0000-0001-6180-0632\n", "https://orcid.org/0000-0003-1743-6428\n", "https://orcid.org/0000-0002-3344-7605\n", "https://orcid.org/0000-0001-5989-289X\n" ] } ], "source": [ "output_cluster(clusters, 600, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 900 - Legit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 13" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "fsb.hr 13\n", "cam.ac.uk 1\n", "researchgate.net 1\n", "utwente.nl 1\n", "wikki.co.uk 1\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=12.52)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 0.7647058823529411" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0001-7549-8972\n", "https://orcid.org/0000-0003-2207-9162\n", "https://orcid.org/0000-0001-7549-8972\n", "https://orcid.org/0000-0002-6487-4749\n", "https://orcid.org/0000-0003-2063-2294\n", "https://orcid.org/0000-0002-8170-5787\n", "https://orcid.org/0000-0001-7353-0537\n", "https://orcid.org/0000-0002-8170-5787\n", "https://orcid.org/0000-0002-8596-1972\n", "https://orcid.org/0000-0003-4481-8288\n" ] } ], "source": [ "output_cluster(clusters, 900, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "### Cluster 1000 - Illegit" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Cluster cardinality: 12" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Domains summary" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ " orcid\n", "url_domains \n", "vic-casino.com 12\n" ] }, { "data": { "text/markdown": [ "------------" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### Last percentile stats (freq >=12.0)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "Ratio 1.0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/markdown": [ "#### 10 random examples" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "https://orcid.org/0000-0003-1324-6811\n", "https://orcid.org/0000-0003-4404-4542\n", "https://orcid.org/0000-0003-4637-4668\n", "https://orcid.org/0000-0002-0284-3453\n", "https://orcid.org/0000-0002-7302-4623\n", "https://orcid.org/0000-0002-0095-7211\n", "https://orcid.org/0000-0002-5553-4048\n", "https://orcid.org/0000-0002-9113-3372\n", "https://orcid.org/0000-0003-2003-7217\n", "https://orcid.org/0000-0001-5583-8415\n" ] } ], "source": [ "output_cluster(clusters, 1000, exp, excluded_domains)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "64748\n" ] } ], "source": [ "total = 0\n", "for i in range(0, len(clusters)):\n", " r = get_ratio(clusters, i, exp)\n", " if r > 0.85:\n", " total += len(clusters[i])\n", "print(total)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "45653\n" ] } ], "source": [ "total_more_than_10 = 0\n", "for i in range(0, len(clusters)):\n", " if len(clusters[i]) > 10:\n", " r = get_ratio(clusters, i, exp)\n", " if r > 0.85:\n", " total_more_than_10 += len(clusters[i])\n", "print(total_more_than_10)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }