starting to analyse overlap

This commit is contained in:
Andrea Mannocci 2021-07-26 11:15:14 +02:00
parent dd6b79e69f
commit 7ab83cbb10
7 changed files with 2544 additions and 8584 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -24,16 +24,10 @@
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",

View File

@ -9,16 +9,10 @@
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",

View File

@ -9,16 +9,10 @@
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
@ -999,12 +993,12 @@
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>disk0/00/00/12/03</td>\n",
" <td>disk0/00/00/14/07</td>\n",
" <td>2010-01-06 13:43:48</td>\n",
" <td>2011-07-06 08:24:53</td>\n",
" <td>2010-01-06 13:43:48</td>\n",
" <td>institutional</td>\n",
" <td>10164</td>\n",
" <td>12637</td>\n",
" <td>NaN</td>\n",
" <td>show</td>\n",
" <td>NaN</td>\n",
@ -1026,9 +1020,9 @@
" <td>http://eprints.upnjatim.ac.id/</td>\n",
" <td>Repositorio Institucional</td>\n",
" <td>http://virtuelcampus.univ-msila.dz/fll</td>\n",
" <td>http://repositorio.autonoma.edu.co/jspui/</td>\n",
" <td>http://npl.csircentral.net/</td>\n",
" <td>http://eprints.upnjatim.ac.id/cgi/latest_tool?...</td>\n",
" <td>http://cdigital.uv.mx/feed/rss_1.0/site</td>\n",
" <td>https://twitter.com/rpsicomdp?lang=es</td>\n",
" <td>info:other:archives.eprints.org:import</td>\n",
" <td>TRUE</td>\n",
" <td>TRUE</td>\n",
@ -1044,7 +1038,7 @@
" <td>other</td>\n",
" <td>L1</td>\n",
" <td>2006-05-04 10:48:14</td>\n",
" <td>CSIR-The National Physical Laboratory(NPL) is ...</td>\n",
" <td>¿Quién puede depositar documentos en el reposi...</td>\n",
" <td>This repository is hosted by the Texas Digital...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
@ -1060,9 +1054,9 @@
" <td>celestial</td>\n",
" <td>opendoar</td>\n",
" <td>2021-01-25</td>\n",
" <td>16</td>\n",
" <td>65</td>\n",
" <td>211</td>\n",
" <td>367</td>\n",
" <td>738</td>\n",
" <td>362</td>\n",
" <td>824</td>\n",
" <td>806</td>\n",
" <td>0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...</td>\n",
@ -1161,13 +1155,13 @@
" dir datestamp lastmod \\\n",
"count 5375 5375 5375 \n",
"unique 5375 4127 3966 \n",
"top disk0/00/00/12/03 2010-01-06 13:43:48 2011-07-06 08:24:53 \n",
"top disk0/00/00/14/07 2010-01-06 13:43:48 2011-07-06 08:24:53 \n",
"freq 1 16 8 \n",
"\n",
" status_changed type succeeds commentary \\\n",
"count 5375 5375 107 0 \n",
"unique 4158 12 107 0 \n",
"top 2010-01-06 13:43:48 institutional 10164 NaN \n",
"top 2010-01-06 13:43:48 institutional 12637 NaN \n",
"freq 16 3795 1 NaN \n",
"\n",
" metadata_visibility latitude longitude relation_type relation_uri \\\n",
@ -1215,7 +1209,7 @@
" sword_endpoint \\\n",
"count 176 \n",
"unique 170 \n",
"top http://repositorio.autonoma.edu.co/jspui/ \n",
"top http://npl.csircentral.net/ \n",
"freq 2 \n",
"\n",
" rss_feed \\\n",
@ -1227,7 +1221,7 @@
" twitter_feed \\\n",
"count 115 \n",
"unique 111 \n",
"top http://cdigital.uv.mx/feed/rss_1.0/site \n",
"top https://twitter.com/rpsicomdp?lang=es \n",
"freq 2 \n",
"\n",
" description fulltext open_access mandate \\\n",
@ -1257,7 +1251,7 @@
" note \\\n",
"count 215 \n",
"unique 207 \n",
"top CSIR-The National Physical Laboratory(NPL) is ... \n",
"top ¿Quién puede depositar documentos en el reposi... \n",
"freq 2 \n",
"\n",
" suggestions activity_low \\\n",
@ -1293,7 +1287,7 @@
" webometrics_rank webometrics_size webometrics_visibility \\\n",
"count 148 148 148 \n",
"unique 148 148 148 \n",
"top 16 65 211 \n",
"top 367 738 362 \n",
"freq 1 1 1 \n",
"\n",
" webometrics_rich_files webometrics_scholar \\\n",
@ -2407,7 +2401,7 @@
" <td>{nan, 1}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{disk0/00/00/00/01, nan}</td>\n",
" <td>{nan, disk0/00/00/00/01}</td>\n",
" <td>{nan, 2010-01-06 13:43:48}</td>\n",
" <td>{nan, 2011-07-18 05:40:07}</td>\n",
" <td>{nan, 2010-01-06 13:43:48}</td>\n",
@ -2432,7 +2426,7 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, http://archivesic.ccsd.cnrs.fr/}</td>\n",
" <td>{nan, @RCHIVESIC }</td>\n",
" <td>{@RCHIVESIC , nan}</td>\n",
" <td>{nan, http://archivesic.ccsd.cnrs.fr/oai/oai.php}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2448,7 +2442,7 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, hal}</td>\n",
" <td>{nan, geoname_2_FR}</td>\n",
" <td>{geoname_2_FR, nan}</td>\n",
" <td>{nan, other}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, 2002-05-17 19:24:41}</td>\n",
@ -2458,12 +2452,12 @@
" <td>{nan, 0}</td>\n",
" <td>{nan, 0}</td>\n",
" <td>{nan, 25}</td>\n",
" <td>{nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...</td>\n",
" <td>{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{opendoar, celestial}</td>\n",
" <td>{celestial, opendoar}</td>\n",
" <td>{58, 669}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2486,7 +2480,7 @@
" <td>{nan}</td>\n",
" <td>{nan, disk0/00/00/00/10}</td>\n",
" <td>{nan, 2010-01-06 13:43:48}</td>\n",
" <td>{2011-07-18 05:40:13, nan}</td>\n",
" <td>{nan, 2011-07-18 05:40:13}</td>\n",
" <td>{nan, 2010-01-06 13:43:48}</td>\n",
" <td>{nan, institutional}</td>\n",
" <td>{nan}</td>\n",
@ -2510,20 +2504,20 @@
" <td>{nan}</td>\n",
" <td>{nan, http://www.diva-portal.org/mdh/}</td>\n",
" <td>{nan, Academic Archive On-line (Mälardalen Uni...</td>\n",
" <td>{http://www.diva-portal.org/oai/mdh/OAI, nan}</td>\n",
" <td>{nan, http://www.diva-portal.org/oai/mdh/OAI}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, se}</td>\n",
" <td>{nan, Uppsala}</td>\n",
" <td>{nan, 59.8667}</td>\n",
" <td>{17.6333, nan}</td>\n",
" <td>{nan, 17.6333}</td>\n",
" <td>{nan, diva}</td>\n",
" <td>{nan, geoname_2_SE}</td>\n",
" <td>{nan, other}</td>\n",
@ -2540,8 +2534,8 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{opendoar, celestial}</td>\n",
" <td>{526, 258}</td>\n",
" <td>{celestial, opendoar}</td>\n",
" <td>{258, 526}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2662,18 +2656,18 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, http://edoc.sub.uni-hamburg.de/klimawand...</td>\n",
" <td>{http://edoc.sub.uni-hamburg.de/klimawandel/, ...</td>\n",
" <td>{nan, Klimawandel Dokumentenserver}</td>\n",
" <td>{nan, http://edoc.sub.uni-hamburg.de/klimawand...</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, The \"Documentenserver Klimawandel\" (Repo...</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{nan, KLIMZUG projects, Climate Service Center...</td>\n",
" <td>{nan, http://www.climateservicecenter.de/, htt...</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan, KLIMZUG projects, Helmholtz-Zentrum Gees...</td>\n",
" <td>{http://www.climateservicecenter.de/, nan, htt...</td>\n",
" <td>{nan, de}</td>\n",
" <td>{nan, Hamburg}</td>\n",
" <td>{nan, 53.5511}</td>\n",
@ -2681,7 +2675,7 @@
" <td>{nan, opus}</td>\n",
" <td>{nan, geoname_2_DE}</td>\n",
" <td>{nan, other}</td>\n",
" <td>{G1, HD, GE, GF, S1}</td>\n",
" <td>{S1, HD, GF, GE, G1}</td>\n",
" <td>{nan, 2015-07-02 08:08:31}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2694,8 +2688,8 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, opendoar, celestial}</td>\n",
" <td>{nan, 3408, 5881}</td>\n",
" <td>{nan, celestial, opendoar}</td>\n",
" <td>{3408, 5881, nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2800,7 +2794,7 @@
"\n",
" dir datestamp \\\n",
"eprintid \n",
"1 {disk0/00/00/00/01, nan} {nan, 2010-01-06 13:43:48} \n",
"1 {nan, disk0/00/00/00/01} {nan, 2010-01-06 13:43:48} \n",
"10 {nan, disk0/00/00/00/10} {nan, 2010-01-06 13:43:48} \n",
"1000 {disk0/00/00/10/00} {2010-01-06 13:45:01} \n",
"10001 {nan, disk0/00/01/00/01} {nan, 2015-08-08 14:52:11} \n",
@ -2809,7 +2803,7 @@
" lastmod status_changed \\\n",
"eprintid \n",
"1 {nan, 2011-07-18 05:40:07} {nan, 2010-01-06 13:43:48} \n",
"10 {2011-07-18 05:40:13, nan} {nan, 2010-01-06 13:43:48} \n",
"10 {nan, 2011-07-18 05:40:13} {nan, 2010-01-06 13:43:48} \n",
"1000 {2011-07-06 08:21:21} {2010-01-06 13:45:01} \n",
"10001 {nan, 2016-03-21 19:44:01} {nan, 2015-08-08 14:52:11} \n",
"10008 {2016-03-21 19:43:51} {2015-08-08 14:52:26} \n",
@ -2859,12 +2853,12 @@
"1 {nan} {nan, http://archivesic.ccsd.cnrs.fr/} \n",
"10 {nan} {nan, http://www.diva-portal.org/mdh/} \n",
"1000 {nan} {http://pam.pisharp.org/} \n",
"10001 {nan} {nan, http://edoc.sub.uni-hamburg.de/klimawand... \n",
"10001 {nan} {http://edoc.sub.uni-hamburg.de/klimawandel/, ... \n",
"10008 {nan} {http://creativematter.skidmore.edu/} \n",
"\n",
" title \\\n",
"eprintid \n",
"1 {nan, @RCHIVESIC } \n",
"1 {@RCHIVESIC , nan} \n",
"10 {nan, Academic Archive On-line (Mälardalen Uni... \n",
"1000 {PAM - Portuguese Archive of Mathematics} \n",
"10001 {nan, Klimawandel Dokumentenserver} \n",
@ -2873,7 +2867,7 @@
" oai_pmh sword_endpoint \\\n",
"eprintid \n",
"1 {nan, http://archivesic.ccsd.cnrs.fr/oai/oai.php} {nan} \n",
"10 {http://www.diva-portal.org/oai/mdh/OAI, nan} {nan} \n",
"10 {nan, http://www.diva-portal.org/oai/mdh/OAI} {nan} \n",
"1000 {nan} {nan} \n",
"10001 {nan, http://edoc.sub.uni-hamburg.de/klimawand... {nan} \n",
"10008 {http://creativematter.skidmore.edu/do/oai/} {nan} \n",
@ -2889,17 +2883,17 @@
" description fulltext \\\n",
"eprintid \n",
"1 {nan} {nan} \n",
"10 {nan} {TRUE, nan} \n",
"10 {nan} {nan, TRUE} \n",
"1000 {nan} {TRUE} \n",
"10001 {nan, The \"Documentenserver Klimawandel\" (Repo... {TRUE, nan} \n",
"10001 {nan, The \"Documentenserver Klimawandel\" (Repo... {nan, TRUE} \n",
"10008 {Welcome to Creative Matter, a repository for ... {TRUE} \n",
"\n",
" open_access mandate \\\n",
"eprintid \n",
"1 {nan} {nan} \n",
"10 {TRUE, nan} {nan} \n",
"10 {nan, TRUE} {nan} \n",
"1000 {TRUE} {nan} \n",
"10001 {TRUE, nan} {TRUE, nan} \n",
"10001 {nan, TRUE} {nan, TRUE} \n",
"10008 {FALSE} {FALSE} \n",
"\n",
" organisation_title \\\n",
@ -2907,7 +2901,7 @@
"1 {nan} \n",
"10 {nan} \n",
"1000 {nan} \n",
"10001 {nan, KLIMZUG projects, Climate Service Center... \n",
"10001 {nan, KLIMZUG projects, Helmholtz-Zentrum Gees... \n",
"10008 {Skidmore College} \n",
"\n",
" organisation_home_page location_country \\\n",
@ -2915,20 +2909,20 @@
"1 {nan} {nan, fr} \n",
"10 {nan} {nan, se} \n",
"1000 {nan} {pt} \n",
"10001 {nan, http://www.climateservicecenter.de/, htt... {nan, de} \n",
"10001 {http://www.climateservicecenter.de/, nan, htt... {nan, de} \n",
"10008 {http://www.skidmore.edu/} {us} \n",
"\n",
" location_city location_latitude location_longitude \\\n",
"eprintid \n",
"1 {nan} {nan} {nan} \n",
"10 {nan, Uppsala} {nan, 59.8667} {17.6333, nan} \n",
"10 {nan, Uppsala} {nan, 59.8667} {nan, 17.6333} \n",
"1000 {Bellevue, WA} {47.6034} {-122.155} \n",
"10001 {nan, Hamburg} {nan, 53.5511} {nan, 9.9937} \n",
"10008 {Saratoga Springs} {43.0961} {-73.7818} \n",
"\n",
" software geoname version \\\n",
"eprintid \n",
"1 {nan, hal} {nan, geoname_2_FR} {nan, other} \n",
"1 {nan, hal} {geoname_2_FR, nan} {nan, other} \n",
"10 {nan, diva} {nan, geoname_2_SE} {nan, other} \n",
"1000 {dspace} {geoname_2_PT} {other} \n",
"10001 {nan, opus} {nan, geoname_2_DE} {nan, other} \n",
@ -2939,7 +2933,7 @@
"1 {nan} {nan, 2002-05-17 19:24:41} {nan} {nan} \n",
"10 {nan} {nan, 2005-12-08 13:15:22} {nan} {nan} \n",
"1000 {nan} {2006-05-04 10:48:14} {nan} {nan} \n",
"10001 {G1, HD, GE, GF, S1} {nan, 2015-07-02 08:08:31} {nan} {nan} \n",
"10001 {S1, HD, GF, GE, G1} {nan, 2015-07-02 08:08:31} {nan} {nan} \n",
"10008 {nan} {2015-07-06 17:35:50} {nan} {nan} \n",
"\n",
" activity_low activity_medium activity_high recordcount \\\n",
@ -2952,7 +2946,7 @@
"\n",
" recordhistory fulltexts_total \\\n",
"eprintid \n",
"1 {nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... {nan} \n",
"1 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... {nan} \n",
"10 {nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,1... {nan} \n",
"1000 {nan} {nan} \n",
"10001 {nan} {nan} \n",
@ -2968,10 +2962,10 @@
"\n",
" registry_name registry_id submit_to \\\n",
"eprintid \n",
"1 {opendoar, celestial} {58, 669} {nan} \n",
"10 {opendoar, celestial} {526, 258} {nan} \n",
"1 {celestial, opendoar} {58, 669} {nan} \n",
"10 {celestial, opendoar} {258, 526} {nan} \n",
"1000 {nan} {nan} {nan} \n",
"10001 {nan, opendoar, celestial} {nan, 3408, 5881} {nan} \n",
"10001 {nan, celestial, opendoar} {3408, 5881, nan} {nan} \n",
"10008 {celestial} {5882} {nan} \n",
"\n",
" submitted_to_name submitted_to_done webometrics_rank \\\n",
@ -3011,7 +3005,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -3252,7 +3246,7 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[58, 669]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -3329,8 +3323,8 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[526, 258]</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[258, 526]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -3461,7 +3455,7 @@
" <td>TRUE</td>\n",
" <td>TRUE</td>\n",
" <td>TRUE</td>\n",
" <td>[KLIMZUG projects, Climate Service Center 2.0,...</td>\n",
" <td>[KLIMZUG projects, Helmholtz-Zentrum Geesthach...</td>\n",
" <td>[http://www.climateservicecenter.de/, http://w...</td>\n",
" <td>de</td>\n",
" <td>Hamburg</td>\n",
@ -3470,7 +3464,7 @@
" <td>opus</td>\n",
" <td>geoname_2_DE</td>\n",
" <td>other</td>\n",
" <td>[G1, GF, HD, GE, S1]</td>\n",
" <td>[S1, GE, HD, GF, G1]</td>\n",
" <td>2015-07-02 08:08:31</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -3483,7 +3477,7 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[3408, 5881]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -3688,7 +3682,7 @@
"1 NaN \n",
"10 NaN \n",
"1000 NaN \n",
"10001 [KLIMZUG projects, Climate Service Center 2.0,... \n",
"10001 [KLIMZUG projects, Helmholtz-Zentrum Geesthach... \n",
"10008 Skidmore College \n",
"\n",
" organisation_home_page location_country \\\n",
@ -3712,7 +3706,7 @@
"1 geoname_2_FR other NaN 2002-05-17 19:24:41 \n",
"10 geoname_2_SE other NaN 2005-12-08 13:15:22 \n",
"1000 geoname_2_PT other NaN 2006-05-04 10:48:14 \n",
"10001 geoname_2_DE other [G1, GF, HD, GE, S1] 2015-07-02 08:08:31 \n",
"10001 geoname_2_DE other [S1, GE, HD, GF, G1] 2015-07-02 08:08:31 \n",
"10008 geoname_2_US other NaN 2015-07-06 17:35:50 \n",
"\n",
" note suggestions activity_low activity_medium activity_high \\\n",
@ -3741,10 +3735,10 @@
"\n",
" registry_name registry_id submit_to submitted_to_name \\\n",
"eprintid \n",
"1 [opendoar, celestial] [58, 669] NaN NaN \n",
"10 [opendoar, celestial] [526, 258] NaN NaN \n",
"1 [celestial, opendoar] [58, 669] NaN NaN \n",
"10 [celestial, opendoar] [258, 526] NaN NaN \n",
"1000 NaN NaN NaN NaN \n",
"10001 [opendoar, celestial] [3408, 5881] NaN NaN \n",
"10001 [celestial, opendoar] [3408, 5881] NaN NaN \n",
"10008 celestial 5882 NaN NaN \n",
"\n",
" submitted_to_done webometrics_rank webometrics_size \\\n",
@ -3772,7 +3766,7 @@
"10008 NaN NaN NaN "
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -3793,7 +3787,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -3959,7 +3953,7 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[opendoar, roarmap, celestial]</td>\n",
" <td>[celestial, roarmap, opendoar]</td>\n",
" <td>[69, 166, 1106]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -4027,7 +4021,7 @@
"4188 NaN NaN NaN \n",
"\n",
" registry_name registry_id submit_to \\\n",
"4188 [opendoar, roarmap, celestial] [69, 166, 1106] NaN \n",
"4188 [celestial, roarmap, opendoar] [69, 166, 1106] NaN \n",
"\n",
" submitted_to_name submitted_to_done webometrics_rank webometrics_size \\\n",
"4188 NaN NaN 1 6 \n",
@ -4042,7 +4036,7 @@
"4188 NaN "
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -4054,7 +4048,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@ -4277,7 +4271,7 @@
" <td>2</td>\n",
" <td>2</td>\n",
" <td>3802</td>\n",
" <td>3772</td>\n",
" <td>3771</td>\n",
" <td>143</td>\n",
" <td>1861</td>\n",
" <td>2887</td>\n",
@ -4299,7 +4293,7 @@
" <td>134</td>\n",
" <td>117</td>\n",
" <td>7</td>\n",
" <td>4255</td>\n",
" <td>4257</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
@ -4314,18 +4308,18 @@
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>10202</td>\n",
" <td>1259</td>\n",
" <td>11</td>\n",
" <td>archive</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>disk0/00/01/46/73</td>\n",
" <td>disk0/00/00/14/07</td>\n",
" <td>2010-01-06 13:43:48</td>\n",
" <td>2011-07-06 08:24:53</td>\n",
" <td>2016-05-02 05:43:04</td>\n",
" <td>2010-01-06 13:43:48</td>\n",
" <td>institutional</td>\n",
" <td>10164</td>\n",
" <td>12637</td>\n",
" <td>NaN</td>\n",
" <td>show</td>\n",
" <td>NaN</td>\n",
@ -4347,7 +4341,7 @@
" <td>http://eprints.upnjatim.ac.id/</td>\n",
" <td>Repositorio Institucional</td>\n",
" <td>http://kce.docressources.info/ws/PMBWs_2</td>\n",
" <td>http://my.indexcopernicus.com/fredemoreno</td>\n",
" <td>http://npl.csircentral.net/</td>\n",
" <td>http://eprints.upnjatim.ac.id/cgi/latest_tool?...</td>\n",
" <td>http://twitter.com/bu_ufsc</td>\n",
" <td>info:other:archives.eprints.org:import</td>\n",
@ -4365,7 +4359,7 @@
" <td>other</td>\n",
" <td>K1</td>\n",
" <td>2006-05-04 10:48:14</td>\n",
" <td>CSIR-The National Physical Laboratory(NPL) is ...</td>\n",
" <td>DSpace@Işık is a growing collection of Işık Un...</td>\n",
" <td>This repository is hosted by the Texas Digital...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
@ -4376,14 +4370,14 @@
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>2479</td>\n",
" <td>[opendoar, roarmap, celestial]</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[2246, 1879]</td>\n",
" <td>[celestial, roarmap, opendoar]</td>\n",
" <td>opendoar</td>\n",
" <td>2021-01-25</td>\n",
" <td>551</td>\n",
" <td>274</td>\n",
" <td>211</td>\n",
" <td>367</td>\n",
" <td>738</td>\n",
" <td>668</td>\n",
" <td>824</td>\n",
" <td>806</td>\n",
" <td>0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...</td>\n",
@ -5022,7 +5016,7 @@
" eprintid rev_number eprint_status userid importid source \\\n",
"count 5375 5375 5375 5375 0.0 0.0 \n",
"unique 5375 658 1 2135 NaN NaN \n",
"top 10202 11 archive 1 NaN NaN \n",
"top 1259 11 archive 1 NaN NaN \n",
"freq 1 332 5375 1333 NaN NaN \n",
"mean NaN NaN NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN NaN NaN \n",
@ -5035,7 +5029,7 @@
" dir datestamp lastmod \\\n",
"count 5375 5375 5375 \n",
"unique 5375 4127 3966 \n",
"top disk0/00/01/46/73 2010-01-06 13:43:48 2011-07-06 08:24:53 \n",
"top disk0/00/00/14/07 2010-01-06 13:43:48 2016-05-02 05:43:04 \n",
"freq 1 16 8 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@ -5048,7 +5042,7 @@
" status_changed type succeeds commentary \\\n",
"count 5375 5375 107 0.0 \n",
"unique 4158 12 107 NaN \n",
"top 2010-01-06 13:43:48 institutional 10164 NaN \n",
"top 2010-01-06 13:43:48 institutional 12637 NaN \n",
"freq 16 3795 1 NaN \n",
"mean NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN \n",
@ -5152,7 +5146,7 @@
" sword_endpoint \\\n",
"count 176 \n",
"unique 170 \n",
"top http://my.indexcopernicus.com/fredemoreno \n",
"top http://npl.csircentral.net/ \n",
"freq 2 \n",
"mean NaN \n",
"std NaN \n",
@ -5203,7 +5197,7 @@
"\n",
" organisation_home_page location_country location_city \\\n",
"count 4226 5080 3655 \n",
"unique 3772 143 1861 \n",
"unique 3771 143 1861 \n",
"top http://www.cas.cn/ us Lima \n",
"freq 9 886 69 \n",
"mean NaN NaN NaN \n",
@ -5243,7 +5237,7 @@
" note \\\n",
"count 215 \n",
"unique 207 \n",
"top CSIR-The National Physical Laboratory(NPL) is ... \n",
"top DSpace@Işık is a growing collection of Işık Un... \n",
"freq 2 \n",
"mean NaN \n",
"std NaN \n",
@ -5295,7 +5289,7 @@
" fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name \\\n",
"count 258 270 258 4603 \n",
"unique 118 134 117 7 \n",
"top 0 0 0 [opendoar, celestial] \n",
"top 0 0 0 [celestial, opendoar] \n",
"freq 114 113 114 2114 \n",
"mean NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN \n",
@ -5307,8 +5301,8 @@
"\n",
" registry_id submit_to submitted_to_name \\\n",
"count 4578 293 205 \n",
"unique 4255 7 1 \n",
"top 2479 [opendoar, roarmap, celestial] opendoar \n",
"unique 4257 7 1 \n",
"top [2246, 1879] [celestial, roarmap, opendoar] opendoar \n",
"freq 4 92 205 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@ -5321,7 +5315,7 @@
" submitted_to_done webometrics_rank webometrics_size \\\n",
"count 205 148 148 \n",
"unique 1 148 148 \n",
"top 2021-01-25 551 274 \n",
"top 2021-01-25 367 738 \n",
"freq 205 1 1 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@ -5334,7 +5328,7 @@
" webometrics_visibility webometrics_rich_files webometrics_scholar \\\n",
"count 148 148 148 \n",
"unique 148 146 143 \n",
"top 211 824 806 \n",
"top 668 824 806 \n",
"freq 1 3 5 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@ -5371,7 +5365,7 @@
"max NaN "
]
},
"execution_count": 12,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -5382,7 +5376,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -5431,7 +5425,7 @@
"dtype: int64"
]
},
"execution_count": 28,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -5442,7 +5436,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@ -5486,7 +5480,7 @@
"dtype: int64"
]
},
"execution_count": 29,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@ -5495,6 +5489,68 @@
"roar_df.isna().sum()[40:]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan, 'TRUE', 'FALSE'], dtype=object)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"roar_df.open_access.unique()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['subject', 'institutional', 'researchdata', 'theses', 'database',\n",
" 'other', 'journal', 'opendata', 'demonstration', 'multi',\n",
" 'learning', 'webobservatory'], dtype=object)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"roar_df.type.unique()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan, 'TRUE', 'FALSE'], dtype=object)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"roar_df.mandate.unique()"
]
},
{
"cell_type": "code",
"execution_count": null,

View File

@ -9,16 +9,10 @@
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
@ -222,10 +216,10 @@
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>GBIF France IPT - GBIF France</td>\n",
" <td>Brassica Information Portal</td>\n",
" <td>CGD</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.w...</td>\n",
" <td>http://www.ebi.ac.uk/merops/</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.e...</td>\n",
" <td>http://web.iodp.tamu.edu/LORE/</td>\n",
" <td>United States</td>\n",
" <td>Life Science</td>\n",
" </tr>\n",
@ -246,19 +240,19 @@
" full_name short_name \\\n",
"count 1752 1752 \n",
"unique 1752 1741 \n",
"top GBIF France IPT - GBIF France CGD \n",
"top Brassica Information Portal CGD \n",
"freq 1 3 \n",
"\n",
" fs_url \\\n",
"count 1752 \n",
"unique 1752 \n",
"top https://fairsharing.org/10.25504/FAIRsharing.w... \n",
"top https://fairsharing.org/10.25504/FAIRsharing.e... \n",
"freq 1 \n",
"\n",
" url countries subjects \n",
"count 1752 1749 1690 \n",
"unique 1752 178 834 \n",
"top http://www.ebi.ac.uk/merops/ United States Life Science \n",
"top http://web.iodp.tamu.edu/LORE/ United States Life Science \n",
"freq 1 588 367 "
]
},

2338
notebooks/03-overlap.ipynb Normal file

File diff suppressed because it is too large Load Diff