starting to analyse overlap

master
Andrea Mannocci 3 years ago
parent dd6b79e69f
commit 7ab83cbb10

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

@ -24,16 +24,10 @@
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",

@ -9,16 +9,10 @@
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",

@ -9,16 +9,10 @@
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
@ -999,12 +993,12 @@
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>disk0/00/00/12/03</td>\n",
" <td>disk0/00/00/14/07</td>\n",
" <td>2010-01-06 13:43:48</td>\n",
" <td>2011-07-06 08:24:53</td>\n",
" <td>2010-01-06 13:43:48</td>\n",
" <td>institutional</td>\n",
" <td>10164</td>\n",
" <td>12637</td>\n",
" <td>NaN</td>\n",
" <td>show</td>\n",
" <td>NaN</td>\n",
@ -1026,9 +1020,9 @@
" <td>http://eprints.upnjatim.ac.id/</td>\n",
" <td>Repositorio Institucional</td>\n",
" <td>http://virtuelcampus.univ-msila.dz/fll</td>\n",
" <td>http://repositorio.autonoma.edu.co/jspui/</td>\n",
" <td>http://npl.csircentral.net/</td>\n",
" <td>http://eprints.upnjatim.ac.id/cgi/latest_tool?...</td>\n",
" <td>http://cdigital.uv.mx/feed/rss_1.0/site</td>\n",
" <td>https://twitter.com/rpsicomdp?lang=es</td>\n",
" <td>info:other:archives.eprints.org:import</td>\n",
" <td>TRUE</td>\n",
" <td>TRUE</td>\n",
@ -1044,7 +1038,7 @@
" <td>other</td>\n",
" <td>L1</td>\n",
" <td>2006-05-04 10:48:14</td>\n",
" <td>CSIR-The National Physical Laboratory(NPL) is ...</td>\n",
" <td>¿Quién puede depositar documentos en el reposi...</td>\n",
" <td>This repository is hosted by the Texas Digital...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
@ -1060,9 +1054,9 @@
" <td>celestial</td>\n",
" <td>opendoar</td>\n",
" <td>2021-01-25</td>\n",
" <td>16</td>\n",
" <td>65</td>\n",
" <td>211</td>\n",
" <td>367</td>\n",
" <td>738</td>\n",
" <td>362</td>\n",
" <td>824</td>\n",
" <td>806</td>\n",
" <td>0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...</td>\n",
@ -1161,13 +1155,13 @@
" dir datestamp lastmod \\\n",
"count 5375 5375 5375 \n",
"unique 5375 4127 3966 \n",
"top disk0/00/00/12/03 2010-01-06 13:43:48 2011-07-06 08:24:53 \n",
"top disk0/00/00/14/07 2010-01-06 13:43:48 2011-07-06 08:24:53 \n",
"freq 1 16 8 \n",
"\n",
" status_changed type succeeds commentary \\\n",
"count 5375 5375 107 0 \n",
"unique 4158 12 107 0 \n",
"top 2010-01-06 13:43:48 institutional 10164 NaN \n",
"top 2010-01-06 13:43:48 institutional 12637 NaN \n",
"freq 16 3795 1 NaN \n",
"\n",
" metadata_visibility latitude longitude relation_type relation_uri \\\n",
@ -1212,11 +1206,11 @@
"top Repositorio Institucional http://virtuelcampus.univ-msila.dz/fll \n",
"freq 7 5 \n",
"\n",
" sword_endpoint \\\n",
"count 176 \n",
"unique 170 \n",
"top http://repositorio.autonoma.edu.co/jspui/ \n",
"freq 2 \n",
" sword_endpoint \\\n",
"count 176 \n",
"unique 170 \n",
"top http://npl.csircentral.net/ \n",
"freq 2 \n",
"\n",
" rss_feed \\\n",
"count 1521 \n",
@ -1224,11 +1218,11 @@
"top http://eprints.upnjatim.ac.id/cgi/latest_tool?... \n",
"freq 5 \n",
"\n",
" twitter_feed \\\n",
"count 115 \n",
"unique 111 \n",
"top http://cdigital.uv.mx/feed/rss_1.0/site \n",
"freq 2 \n",
" twitter_feed \\\n",
"count 115 \n",
"unique 111 \n",
"top https://twitter.com/rpsicomdp?lang=es \n",
"freq 2 \n",
"\n",
" description fulltext open_access mandate \\\n",
"count 3782 4127 4127 3676 \n",
@ -1257,7 +1251,7 @@
" note \\\n",
"count 215 \n",
"unique 207 \n",
"top CSIR-The National Physical Laboratory(NPL) is ... \n",
"top ¿Quién puede depositar documentos en el reposi... \n",
"freq 2 \n",
"\n",
" suggestions activity_low \\\n",
@ -1293,7 +1287,7 @@
" webometrics_rank webometrics_size webometrics_visibility \\\n",
"count 148 148 148 \n",
"unique 148 148 148 \n",
"top 16 65 211 \n",
"top 367 738 362 \n",
"freq 1 1 1 \n",
"\n",
" webometrics_rich_files webometrics_scholar \\\n",
@ -2407,7 +2401,7 @@
" <td>{nan, 1}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{disk0/00/00/00/01, nan}</td>\n",
" <td>{nan, disk0/00/00/00/01}</td>\n",
" <td>{nan, 2010-01-06 13:43:48}</td>\n",
" <td>{nan, 2011-07-18 05:40:07}</td>\n",
" <td>{nan, 2010-01-06 13:43:48}</td>\n",
@ -2432,7 +2426,7 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, http://archivesic.ccsd.cnrs.fr/}</td>\n",
" <td>{nan, @RCHIVESIC }</td>\n",
" <td>{@RCHIVESIC , nan}</td>\n",
" <td>{nan, http://archivesic.ccsd.cnrs.fr/oai/oai.php}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2448,7 +2442,7 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, hal}</td>\n",
" <td>{nan, geoname_2_FR}</td>\n",
" <td>{geoname_2_FR, nan}</td>\n",
" <td>{nan, other}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, 2002-05-17 19:24:41}</td>\n",
@ -2458,12 +2452,12 @@
" <td>{nan, 0}</td>\n",
" <td>{nan, 0}</td>\n",
" <td>{nan, 25}</td>\n",
" <td>{nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...</td>\n",
" <td>{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{opendoar, celestial}</td>\n",
" <td>{celestial, opendoar}</td>\n",
" <td>{58, 669}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2486,7 +2480,7 @@
" <td>{nan}</td>\n",
" <td>{nan, disk0/00/00/00/10}</td>\n",
" <td>{nan, 2010-01-06 13:43:48}</td>\n",
" <td>{2011-07-18 05:40:13, nan}</td>\n",
" <td>{nan, 2011-07-18 05:40:13}</td>\n",
" <td>{nan, 2010-01-06 13:43:48}</td>\n",
" <td>{nan, institutional}</td>\n",
" <td>{nan}</td>\n",
@ -2510,20 +2504,20 @@
" <td>{nan}</td>\n",
" <td>{nan, http://www.diva-portal.org/mdh/}</td>\n",
" <td>{nan, Academic Archive On-line (Mälardalen Uni...</td>\n",
" <td>{http://www.diva-portal.org/oai/mdh/OAI, nan}</td>\n",
" <td>{nan, http://www.diva-portal.org/oai/mdh/OAI}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, se}</td>\n",
" <td>{nan, Uppsala}</td>\n",
" <td>{nan, 59.8667}</td>\n",
" <td>{17.6333, nan}</td>\n",
" <td>{nan, 17.6333}</td>\n",
" <td>{nan, diva}</td>\n",
" <td>{nan, geoname_2_SE}</td>\n",
" <td>{nan, other}</td>\n",
@ -2540,8 +2534,8 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{opendoar, celestial}</td>\n",
" <td>{526, 258}</td>\n",
" <td>{celestial, opendoar}</td>\n",
" <td>{258, 526}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2662,18 +2656,18 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, http://edoc.sub.uni-hamburg.de/klimawand...</td>\n",
" <td>{http://edoc.sub.uni-hamburg.de/klimawandel/, ...</td>\n",
" <td>{nan, Klimawandel Dokumentenserver}</td>\n",
" <td>{nan, http://edoc.sub.uni-hamburg.de/klimawand...</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, The \"Documentenserver Klimawandel\" (Repo...</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{TRUE, nan}</td>\n",
" <td>{nan, KLIMZUG projects, Climate Service Center...</td>\n",
" <td>{nan, http://www.climateservicecenter.de/, htt...</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan, TRUE}</td>\n",
" <td>{nan, KLIMZUG projects, Helmholtz-Zentrum Gees...</td>\n",
" <td>{http://www.climateservicecenter.de/, nan, htt...</td>\n",
" <td>{nan, de}</td>\n",
" <td>{nan, Hamburg}</td>\n",
" <td>{nan, 53.5511}</td>\n",
@ -2681,7 +2675,7 @@
" <td>{nan, opus}</td>\n",
" <td>{nan, geoname_2_DE}</td>\n",
" <td>{nan, other}</td>\n",
" <td>{G1, HD, GE, GF, S1}</td>\n",
" <td>{S1, HD, GF, GE, G1}</td>\n",
" <td>{nan, 2015-07-02 08:08:31}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2694,8 +2688,8 @@
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan, opendoar, celestial}</td>\n",
" <td>{nan, 3408, 5881}</td>\n",
" <td>{nan, celestial, opendoar}</td>\n",
" <td>{3408, 5881, nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
" <td>{nan}</td>\n",
@ -2800,7 +2794,7 @@
"\n",
" dir datestamp \\\n",
"eprintid \n",
"1 {disk0/00/00/00/01, nan} {nan, 2010-01-06 13:43:48} \n",
"1 {nan, disk0/00/00/00/01} {nan, 2010-01-06 13:43:48} \n",
"10 {nan, disk0/00/00/00/10} {nan, 2010-01-06 13:43:48} \n",
"1000 {disk0/00/00/10/00} {2010-01-06 13:45:01} \n",
"10001 {nan, disk0/00/01/00/01} {nan, 2015-08-08 14:52:11} \n",
@ -2809,7 +2803,7 @@
" lastmod status_changed \\\n",
"eprintid \n",
"1 {nan, 2011-07-18 05:40:07} {nan, 2010-01-06 13:43:48} \n",
"10 {2011-07-18 05:40:13, nan} {nan, 2010-01-06 13:43:48} \n",
"10 {nan, 2011-07-18 05:40:13} {nan, 2010-01-06 13:43:48} \n",
"1000 {2011-07-06 08:21:21} {2010-01-06 13:45:01} \n",
"10001 {nan, 2016-03-21 19:44:01} {nan, 2015-08-08 14:52:11} \n",
"10008 {2016-03-21 19:43:51} {2015-08-08 14:52:26} \n",
@ -2859,12 +2853,12 @@
"1 {nan} {nan, http://archivesic.ccsd.cnrs.fr/} \n",
"10 {nan} {nan, http://www.diva-portal.org/mdh/} \n",
"1000 {nan} {http://pam.pisharp.org/} \n",
"10001 {nan} {nan, http://edoc.sub.uni-hamburg.de/klimawand... \n",
"10001 {nan} {http://edoc.sub.uni-hamburg.de/klimawandel/, ... \n",
"10008 {nan} {http://creativematter.skidmore.edu/} \n",
"\n",
" title \\\n",
"eprintid \n",
"1 {nan, @RCHIVESIC } \n",
"1 {@RCHIVESIC , nan} \n",
"10 {nan, Academic Archive On-line (Mälardalen Uni... \n",
"1000 {PAM - Portuguese Archive of Mathematics} \n",
"10001 {nan, Klimawandel Dokumentenserver} \n",
@ -2873,7 +2867,7 @@
" oai_pmh sword_endpoint \\\n",
"eprintid \n",
"1 {nan, http://archivesic.ccsd.cnrs.fr/oai/oai.php} {nan} \n",
"10 {http://www.diva-portal.org/oai/mdh/OAI, nan} {nan} \n",
"10 {nan, http://www.diva-portal.org/oai/mdh/OAI} {nan} \n",
"1000 {nan} {nan} \n",
"10001 {nan, http://edoc.sub.uni-hamburg.de/klimawand... {nan} \n",
"10008 {http://creativematter.skidmore.edu/do/oai/} {nan} \n",
@ -2889,17 +2883,17 @@
" description fulltext \\\n",
"eprintid \n",
"1 {nan} {nan} \n",
"10 {nan} {TRUE, nan} \n",
"10 {nan} {nan, TRUE} \n",
"1000 {nan} {TRUE} \n",
"10001 {nan, The \"Documentenserver Klimawandel\" (Repo... {TRUE, nan} \n",
"10001 {nan, The \"Documentenserver Klimawandel\" (Repo... {nan, TRUE} \n",
"10008 {Welcome to Creative Matter, a repository for ... {TRUE} \n",
"\n",
" open_access mandate \\\n",
"eprintid \n",
"1 {nan} {nan} \n",
"10 {TRUE, nan} {nan} \n",
"10 {nan, TRUE} {nan} \n",
"1000 {TRUE} {nan} \n",
"10001 {TRUE, nan} {TRUE, nan} \n",
"10001 {nan, TRUE} {nan, TRUE} \n",
"10008 {FALSE} {FALSE} \n",
"\n",
" organisation_title \\\n",
@ -2907,7 +2901,7 @@
"1 {nan} \n",
"10 {nan} \n",
"1000 {nan} \n",
"10001 {nan, KLIMZUG projects, Climate Service Center... \n",
"10001 {nan, KLIMZUG projects, Helmholtz-Zentrum Gees... \n",
"10008 {Skidmore College} \n",
"\n",
" organisation_home_page location_country \\\n",
@ -2915,20 +2909,20 @@
"1 {nan} {nan, fr} \n",
"10 {nan} {nan, se} \n",
"1000 {nan} {pt} \n",
"10001 {nan, http://www.climateservicecenter.de/, htt... {nan, de} \n",
"10001 {http://www.climateservicecenter.de/, nan, htt... {nan, de} \n",
"10008 {http://www.skidmore.edu/} {us} \n",
"\n",
" location_city location_latitude location_longitude \\\n",
"eprintid \n",
"1 {nan} {nan} {nan} \n",
"10 {nan, Uppsala} {nan, 59.8667} {17.6333, nan} \n",
"10 {nan, Uppsala} {nan, 59.8667} {nan, 17.6333} \n",
"1000 {Bellevue, WA} {47.6034} {-122.155} \n",
"10001 {nan, Hamburg} {nan, 53.5511} {nan, 9.9937} \n",
"10008 {Saratoga Springs} {43.0961} {-73.7818} \n",
"\n",
" software geoname version \\\n",
"eprintid \n",
"1 {nan, hal} {nan, geoname_2_FR} {nan, other} \n",
"1 {nan, hal} {geoname_2_FR, nan} {nan, other} \n",
"10 {nan, diva} {nan, geoname_2_SE} {nan, other} \n",
"1000 {dspace} {geoname_2_PT} {other} \n",
"10001 {nan, opus} {nan, geoname_2_DE} {nan, other} \n",
@ -2939,7 +2933,7 @@
"1 {nan} {nan, 2002-05-17 19:24:41} {nan} {nan} \n",
"10 {nan} {nan, 2005-12-08 13:15:22} {nan} {nan} \n",
"1000 {nan} {2006-05-04 10:48:14} {nan} {nan} \n",
"10001 {G1, HD, GE, GF, S1} {nan, 2015-07-02 08:08:31} {nan} {nan} \n",
"10001 {S1, HD, GF, GE, G1} {nan, 2015-07-02 08:08:31} {nan} {nan} \n",
"10008 {nan} {2015-07-06 17:35:50} {nan} {nan} \n",
"\n",
" activity_low activity_medium activity_high recordcount \\\n",
@ -2952,7 +2946,7 @@
"\n",
" recordhistory fulltexts_total \\\n",
"eprintid \n",
"1 {nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... {nan} \n",
"1 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... {nan} \n",
"10 {nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,1... {nan} \n",
"1000 {nan} {nan} \n",
"10001 {nan} {nan} \n",
@ -2968,10 +2962,10 @@
"\n",
" registry_name registry_id submit_to \\\n",
"eprintid \n",
"1 {opendoar, celestial} {58, 669} {nan} \n",
"10 {opendoar, celestial} {526, 258} {nan} \n",
"1 {celestial, opendoar} {58, 669} {nan} \n",
"10 {celestial, opendoar} {258, 526} {nan} \n",
"1000 {nan} {nan} {nan} \n",
"10001 {nan, opendoar, celestial} {nan, 3408, 5881} {nan} \n",
"10001 {nan, celestial, opendoar} {3408, 5881, nan} {nan} \n",
"10008 {celestial} {5882} {nan} \n",
"\n",
" submitted_to_name submitted_to_done webometrics_rank \\\n",
@ -3011,7 +3005,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -3252,7 +3246,7 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[58, 669]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -3329,8 +3323,8 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[526, 258]</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[258, 526]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -3461,7 +3455,7 @@
" <td>TRUE</td>\n",
" <td>TRUE</td>\n",
" <td>TRUE</td>\n",
" <td>[KLIMZUG projects, Climate Service Center 2.0,...</td>\n",
" <td>[KLIMZUG projects, Helmholtz-Zentrum Geesthach...</td>\n",
" <td>[http://www.climateservicecenter.de/, http://w...</td>\n",
" <td>de</td>\n",
" <td>Hamburg</td>\n",
@ -3470,7 +3464,7 @@
" <td>opus</td>\n",
" <td>geoname_2_DE</td>\n",
" <td>other</td>\n",
" <td>[G1, GF, HD, GE, S1]</td>\n",
" <td>[S1, GE, HD, GF, G1]</td>\n",
" <td>2015-07-02 08:08:31</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -3483,7 +3477,7 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[3408, 5881]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -3688,7 +3682,7 @@
"1 NaN \n",
"10 NaN \n",
"1000 NaN \n",
"10001 [KLIMZUG projects, Climate Service Center 2.0,... \n",
"10001 [KLIMZUG projects, Helmholtz-Zentrum Geesthach... \n",
"10008 Skidmore College \n",
"\n",
" organisation_home_page location_country \\\n",
@ -3712,7 +3706,7 @@
"1 geoname_2_FR other NaN 2002-05-17 19:24:41 \n",
"10 geoname_2_SE other NaN 2005-12-08 13:15:22 \n",
"1000 geoname_2_PT other NaN 2006-05-04 10:48:14 \n",
"10001 geoname_2_DE other [G1, GF, HD, GE, S1] 2015-07-02 08:08:31 \n",
"10001 geoname_2_DE other [S1, GE, HD, GF, G1] 2015-07-02 08:08:31 \n",
"10008 geoname_2_US other NaN 2015-07-06 17:35:50 \n",
"\n",
" note suggestions activity_low activity_medium activity_high \\\n",
@ -3741,10 +3735,10 @@
"\n",
" registry_name registry_id submit_to submitted_to_name \\\n",
"eprintid \n",
"1 [opendoar, celestial] [58, 669] NaN NaN \n",
"10 [opendoar, celestial] [526, 258] NaN NaN \n",
"1 [celestial, opendoar] [58, 669] NaN NaN \n",
"10 [celestial, opendoar] [258, 526] NaN NaN \n",
"1000 NaN NaN NaN NaN \n",
"10001 [opendoar, celestial] [3408, 5881] NaN NaN \n",
"10001 [celestial, opendoar] [3408, 5881] NaN NaN \n",
"10008 celestial 5882 NaN NaN \n",
"\n",
" submitted_to_done webometrics_rank webometrics_size \\\n",
@ -3772,7 +3766,7 @@
"10008 NaN NaN NaN "
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -3793,7 +3787,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -3959,7 +3953,7 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[opendoar, roarmap, celestial]</td>\n",
" <td>[celestial, roarmap, opendoar]</td>\n",
" <td>[69, 166, 1106]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -4027,7 +4021,7 @@
"4188 NaN NaN NaN \n",
"\n",
" registry_name registry_id submit_to \\\n",
"4188 [opendoar, roarmap, celestial] [69, 166, 1106] NaN \n",
"4188 [celestial, roarmap, opendoar] [69, 166, 1106] NaN \n",
"\n",
" submitted_to_name submitted_to_done webometrics_rank webometrics_size \\\n",
"4188 NaN NaN 1 6 \n",
@ -4042,7 +4036,7 @@
"4188 NaN "
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -4054,7 +4048,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@ -4277,7 +4271,7 @@
" <td>2</td>\n",
" <td>2</td>\n",
" <td>3802</td>\n",
" <td>3772</td>\n",
" <td>3771</td>\n",
" <td>143</td>\n",
" <td>1861</td>\n",
" <td>2887</td>\n",
@ -4299,7 +4293,7 @@
" <td>134</td>\n",
" <td>117</td>\n",
" <td>7</td>\n",
" <td>4255</td>\n",
" <td>4257</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
@ -4314,18 +4308,18 @@
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>10202</td>\n",
" <td>1259</td>\n",
" <td>11</td>\n",
" <td>archive</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>disk0/00/01/46/73</td>\n",
" <td>disk0/00/00/14/07</td>\n",
" <td>2010-01-06 13:43:48</td>\n",
" <td>2011-07-06 08:24:53</td>\n",
" <td>2016-05-02 05:43:04</td>\n",
" <td>2010-01-06 13:43:48</td>\n",
" <td>institutional</td>\n",
" <td>10164</td>\n",
" <td>12637</td>\n",
" <td>NaN</td>\n",
" <td>show</td>\n",
" <td>NaN</td>\n",
@ -4347,7 +4341,7 @@
" <td>http://eprints.upnjatim.ac.id/</td>\n",
" <td>Repositorio Institucional</td>\n",
" <td>http://kce.docressources.info/ws/PMBWs_2</td>\n",
" <td>http://my.indexcopernicus.com/fredemoreno</td>\n",
" <td>http://npl.csircentral.net/</td>\n",
" <td>http://eprints.upnjatim.ac.id/cgi/latest_tool?...</td>\n",
" <td>http://twitter.com/bu_ufsc</td>\n",
" <td>info:other:archives.eprints.org:import</td>\n",
@ -4365,7 +4359,7 @@
" <td>other</td>\n",
" <td>K1</td>\n",
" <td>2006-05-04 10:48:14</td>\n",
" <td>CSIR-The National Physical Laboratory(NPL) is ...</td>\n",
" <td>DSpace@Işık is a growing collection of Işık Un...</td>\n",
" <td>This repository is hosted by the Texas Digital...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
@ -4376,14 +4370,14 @@
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>2479</td>\n",
" <td>[opendoar, roarmap, celestial]</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[2246, 1879]</td>\n",
" <td>[celestial, roarmap, opendoar]</td>\n",
" <td>opendoar</td>\n",
" <td>2021-01-25</td>\n",
" <td>551</td>\n",
" <td>274</td>\n",
" <td>211</td>\n",
" <td>367</td>\n",
" <td>738</td>\n",
" <td>668</td>\n",
" <td>824</td>\n",
" <td>806</td>\n",
" <td>0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...</td>\n",
@ -5022,7 +5016,7 @@
" eprintid rev_number eprint_status userid importid source \\\n",
"count 5375 5375 5375 5375 0.0 0.0 \n",
"unique 5375 658 1 2135 NaN NaN \n",
"top 10202 11 archive 1 NaN NaN \n",
"top 1259 11 archive 1 NaN NaN \n",
"freq 1 332 5375 1333 NaN NaN \n",
"mean NaN NaN NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN NaN NaN \n",
@ -5035,7 +5029,7 @@
" dir datestamp lastmod \\\n",
"count 5375 5375 5375 \n",
"unique 5375 4127 3966 \n",
"top disk0/00/01/46/73 2010-01-06 13:43:48 2011-07-06 08:24:53 \n",
"top disk0/00/00/14/07 2010-01-06 13:43:48 2016-05-02 05:43:04 \n",
"freq 1 16 8 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@ -5048,7 +5042,7 @@
" status_changed type succeeds commentary \\\n",
"count 5375 5375 107 0.0 \n",
"unique 4158 12 107 NaN \n",
"top 2010-01-06 13:43:48 institutional 10164 NaN \n",
"top 2010-01-06 13:43:48 institutional 12637 NaN \n",
"freq 16 3795 1 NaN \n",
"mean NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN \n",
@ -5149,18 +5143,18 @@
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" sword_endpoint \\\n",
"count 176 \n",
"unique 170 \n",
"top http://my.indexcopernicus.com/fredemoreno \n",
"freq 2 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
" sword_endpoint \\\n",
"count 176 \n",
"unique 170 \n",
"top http://npl.csircentral.net/ \n",
"freq 2 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" rss_feed \\\n",
"count 1521 \n",
@ -5203,7 +5197,7 @@
"\n",
" organisation_home_page location_country location_city \\\n",
"count 4226 5080 3655 \n",
"unique 3772 143 1861 \n",
"unique 3771 143 1861 \n",
"top http://www.cas.cn/ us Lima \n",
"freq 9 886 69 \n",
"mean NaN NaN NaN \n",
@ -5243,7 +5237,7 @@
" note \\\n",
"count 215 \n",
"unique 207 \n",
"top CSIR-The National Physical Laboratory(NPL) is ... \n",
"top DSpace@Işık is a growing collection of Işık Un... \n",
"freq 2 \n",
"mean NaN \n",
"std NaN \n",
@ -5295,7 +5289,7 @@
" fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name \\\n",
"count 258 270 258 4603 \n",
"unique 118 134 117 7 \n",
"top 0 0 0 [opendoar, celestial] \n",
"top 0 0 0 [celestial, opendoar] \n",
"freq 114 113 114 2114 \n",
"mean NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN \n",
@ -5305,23 +5299,23 @@
"75% NaN NaN NaN NaN \n",
"max NaN NaN NaN NaN \n",
"\n",
" registry_id submit_to submitted_to_name \\\n",
"count 4578 293 205 \n",
"unique 4255 7 1 \n",
"top 2479 [opendoar, roarmap, celestial] opendoar \n",
"freq 4 92 205 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
"min NaN NaN NaN \n",
"25% NaN NaN NaN \n",
"50% NaN NaN NaN \n",
"75% NaN NaN NaN \n",
"max NaN NaN NaN \n",
" registry_id submit_to submitted_to_name \\\n",
"count 4578 293 205 \n",
"unique 4257 7 1 \n",
"top [2246, 1879] [celestial, roarmap, opendoar] opendoar \n",
"freq 4 92 205 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
"min NaN NaN NaN \n",
"25% NaN NaN NaN \n",
"50% NaN NaN NaN \n",
"75% NaN NaN NaN \n",
"max NaN NaN NaN \n",
"\n",
" submitted_to_done webometrics_rank webometrics_size \\\n",
"count 205 148 148 \n",
"unique 1 148 148 \n",
"top 2021-01-25 551 274 \n",
"top 2021-01-25 367 738 \n",
"freq 205 1 1 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@ -5334,7 +5328,7 @@
" webometrics_visibility webometrics_rich_files webometrics_scholar \\\n",
"count 148 148 148 \n",
"unique 148 146 143 \n",
"top 211 824 806 \n",
"top 668 824 806 \n",
"freq 1 3 5 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
@ -5371,7 +5365,7 @@
"max NaN "
]
},
"execution_count": 12,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -5382,7 +5376,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -5431,7 +5425,7 @@
"dtype: int64"
]
},
"execution_count": 28,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -5442,7 +5436,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@ -5486,7 +5480,7 @@
"dtype: int64"
]
},
"execution_count": 29,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@ -5495,6 +5489,68 @@
"roar_df.isna().sum()[40:]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan, 'TRUE', 'FALSE'], dtype=object)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"roar_df.open_access.unique()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['subject', 'institutional', 'researchdata', 'theses', 'database',\n",
" 'other', 'journal', 'opendata', 'demonstration', 'multi',\n",
" 'learning', 'webobservatory'], dtype=object)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"roar_df.type.unique()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan, 'TRUE', 'FALSE'], dtype=object)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"roar_df.mandate.unique()"
]
},
{
"cell_type": "code",
"execution_count": null,

@ -9,16 +9,10 @@
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
@ -222,10 +216,10 @@
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>GBIF France IPT - GBIF France</td>\n",
" <td>Brassica Information Portal</td>\n",
" <td>CGD</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.w...</td>\n",
" <td>http://www.ebi.ac.uk/merops/</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.e...</td>\n",
" <td>http://web.iodp.tamu.edu/LORE/</td>\n",
" <td>United States</td>\n",
" <td>Life Science</td>\n",
" </tr>\n",
@ -243,23 +237,23 @@
"</div>"
],
"text/plain": [
" full_name short_name \\\n",
"count 1752 1752 \n",
"unique 1752 1741 \n",
"top GBIF France IPT - GBIF France CGD \n",
"freq 1 3 \n",
" full_name short_name \\\n",
"count 1752 1752 \n",
"unique 1752 1741 \n",
"top Brassica Information Portal CGD \n",
"freq 1 3 \n",
"\n",
" fs_url \\\n",
"count 1752 \n",
"unique 1752 \n",
"top https://fairsharing.org/10.25504/FAIRsharing.w... \n",
"top https://fairsharing.org/10.25504/FAIRsharing.e... \n",
"freq 1 \n",
"\n",
" url countries subjects \n",
"count 1752 1749 1690 \n",
"unique 1752 178 834 \n",
"top http://www.ebi.ac.uk/merops/ United States Life Science \n",
"freq 1 588 367 "
" url countries subjects \n",
"count 1752 1749 1690 \n",
"unique 1752 178 834 \n",
"top http://web.iodp.tamu.edu/LORE/ United States Life Science \n",
"freq 1 588 367 "
]
},
"execution_count": 3,

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save