From c2943c48188e8fcb066775a8d46a5e96be1d6e87 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Fri, 23 Jul 2021 12:41:17 +0200 Subject: [PATCH] each registry has a basic analysis --- notebooks/01.2-exploration-opendoar.ipynb | 1493 ++++++------------ notebooks/01.4-exploration-fairsharing.ipynb | 1103 +------------ 2 files changed, 505 insertions(+), 2091 deletions(-) diff --git a/notebooks/01.2-exploration-opendoar.ipynb b/notebooks/01.2-exploration-opendoar.ipynb index b79a854..dcdad8e 100644 --- a/notebooks/01.2-exploration-opendoar.ipynb +++ b/notebooks/01.2-exploration-opendoar.ipynb @@ -34,13 +34,6 @@ "## Loading datasets" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**re3data**" - ] - }, { "cell_type": "code", "execution_count": 3, @@ -67,105 +60,206 @@ " \n", " \n", " \n", - " re3data_id\n", + " openaire_id\n", + " opendoar_id\n", " repository_name\n", + " additional_name\n", + " repository_url\n", + " description\n", " type\n", + " update_date\n", + " start_date\n", " subject\n", - " provider_type\n", - " keyword\n", + " content_type\n", " institution\n", + " metadata_policy\n", + " data_policy\n", + " submission_policy\n", + " content_policy\n", + " software\n", + " api\n", " \n", " \n", " \n", " \n", " 0\n", - " r3d100000001\n", - " Odum Institute Archive Dataverse\n", - " [disciplinary]\n", - " [1 Humanities and Social Sciences, 111 Social ...\n", - " [dataProvider]\n", - " [FAIR, Middle East, crime, demography, economy...\n", - " [[Odum Institute for Research in Social Scienc...\n", + " opendoar____::38b3eff8baf56627478ec76a704e9b52\n", + " 101\n", + " utrecht university repository\n", + " []\n", + " http://dspace.library.uu.nl\n", + " this site is a university repository providing...\n", + " institutional\n", + " 2021-04-16 15:22:03\n", + " 2006-01-13 12:55:13\n", + " [multidisciplinary]\n", + " [journal_articles, conference_and_workshop_pap...\n", + " [[university of utrecht, [universiteit utrecht...\n", + " True\n", + " True\n", + " False\n", + " True\n", + " dspace\n", + " true\n", " \n", " \n", " 1\n", - " r3d100000002\n", - " Access to Archival Databases\n", - " [disciplinary]\n", - " [1 Humanities and Social Sciences, 102 History...\n", - " [dataProvider]\n", - " [US History]\n", - " [[The U.S. National Archives and Records Admin...\n", + " opendoar____::2b44928ae11fb9384c4cf38708677c48\n", + " 115\n", + " dspace at indian institute of management kozhi...\n", + " [dspace@iimk]\n", + " http://dspace.iimk.ac.in/\n", + " this site is a subject based university reposi...\n", + " institutional\n", + " 2021-02-18 17:36:43\n", + " 2006-01-04 11:54:34\n", + " [ecology and environment, social sciences gene...\n", + " [journal_articles, conference_and_workshop_pap...\n", + " [[indian institute of management kozhikode, [i...\n", + " True\n", + " True\n", + " True\n", + " True\n", + " dspace 4.1\n", + " true\n", " \n", " \n", " 2\n", - " r3d100000004\n", - " Datenbank Gesprochenes Deutsch\n", - " [disciplinary]\n", - " [1 Humanities and Social Sciences, 104 Linguis...\n", - " [dataProvider, serviceProvider]\n", - " [Australian German, FOLK, German dialects, Pfe...\n", - " [[Institut für Deutsche Sprache, Archiv für Ge...\n", + " opendoar____::3416a75f4cea9109507cacd8e2f2aefc\n", + " 41\n", + " caltech engineering and science online\n", + " []\n", + " http://calteches.library.caltech.edu/\n", + " the caltech archives holds approximately 220 c...\n", + " institutional\n", + " 2021-02-18 17:36:28\n", + " 2006-01-04 14:47:04\n", + " [biology and biochemistry, chemistry and chemi...\n", + " [journal_articles, conference_and_workshop_pap...\n", + " [[california institute of technology, [caltech...\n", + " True\n", + " True\n", + " True\n", + " True\n", + " eprints 3.1.3\n", + " true\n", " \n", " \n", " 3\n", - " r3d100000005\n", - " UNC Dataverse\n", - " [institutional]\n", - " [1 Humanities and Social Sciences, 111 Social ...\n", - " [dataProvider, serviceProvider]\n", - " [FAIR, census, demographic survey, demography,...\n", - " [[Odum Institute for Research in Social Scienc...\n", + " opendoar____::07e1cd7dca89a1678042477183b7ac3f\n", + " 119\n", + " dcu online research access service\n", + " [doras]\n", + " http://doras.dcu.ie/\n", + " this site is an institutional repository provi...\n", + " institutional\n", + " 2021-02-18 17:36:44\n", + " 2006-01-04 11:15:19\n", + " [multidisciplinary]\n", + " [journal_articles, conference_and_workshop_pap...\n", + " [[dublin city university, [dcu], ie, [], , htt...\n", + " True\n", + " True\n", + " True\n", + " True\n", + " eprints 3.0.5\n", + " true\n", " \n", " \n", " 4\n", - " r3d100000006\n", - " Archaeology Data Service\n", - " [disciplinary]\n", - " [1 Humanities and Social Sciences, 101 Ancient...\n", - " [dataProvider, serviceProvider]\n", - " [FAIR, archaeology, cultural heritage, prehist...\n", - " [[Arts and Humanities Research Council, [AHRC]...\n", + " opendoar____::d1f491a404d6854880943e5c3cd9ca25\n", + " 129\n", + " earth-prints repository\n", + " []\n", + " http://www.earth-prints.org/\n", + " a subject based repository providing open acce...\n", + " disciplinary\n", + " 2021-04-19 08:28:38\n", + " 2006-01-30 16:43:11\n", + " [earth and planetary sciences]\n", + " [journal_articles, conference_and_workshop_pap...\n", + " [[istituto nazionale di geofisica e vulcanolog...\n", + " True\n", + " True\n", + " True\n", + " True\n", + " dspace 5.8.1-snapshot\n", + " true\n", " \n", " \n", "\n", "" ], "text/plain": [ - " re3data_id repository_name type \\\n", - "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n", - "1 r3d100000002 Access to Archival Databases [disciplinary] \n", - "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n", - "3 r3d100000005 UNC Dataverse [institutional] \n", - "4 r3d100000006 Archaeology Data Service [disciplinary] \n", + " openaire_id opendoar_id \\\n", + "0 opendoar____::38b3eff8baf56627478ec76a704e9b52 101 \n", + "1 opendoar____::2b44928ae11fb9384c4cf38708677c48 115 \n", + "2 opendoar____::3416a75f4cea9109507cacd8e2f2aefc 41 \n", + "3 opendoar____::07e1cd7dca89a1678042477183b7ac3f 119 \n", + "4 opendoar____::d1f491a404d6854880943e5c3cd9ca25 129 \n", + "\n", + " repository_name additional_name \\\n", + "0 utrecht university repository [] \n", + "1 dspace at indian institute of management kozhi... [dspace@iimk] \n", + "2 caltech engineering and science online [] \n", + "3 dcu online research access service [doras] \n", + "4 earth-prints repository [] \n", + "\n", + " repository_url \\\n", + "0 http://dspace.library.uu.nl \n", + "1 http://dspace.iimk.ac.in/ \n", + "2 http://calteches.library.caltech.edu/ \n", + "3 http://doras.dcu.ie/ \n", + "4 http://www.earth-prints.org/ \n", + "\n", + " description type \\\n", + "0 this site is a university repository providing... institutional \n", + "1 this site is a subject based university reposi... institutional \n", + "2 the caltech archives holds approximately 220 c... institutional \n", + "3 this site is an institutional repository provi... institutional \n", + "4 a subject based repository providing open acce... disciplinary \n", + "\n", + " update_date start_date \\\n", + "0 2021-04-16 15:22:03 2006-01-13 12:55:13 \n", + "1 2021-02-18 17:36:43 2006-01-04 11:54:34 \n", + "2 2021-02-18 17:36:28 2006-01-04 14:47:04 \n", + "3 2021-02-18 17:36:44 2006-01-04 11:15:19 \n", + "4 2021-04-19 08:28:38 2006-01-30 16:43:11 \n", "\n", " subject \\\n", - "0 [1 Humanities and Social Sciences, 111 Social ... \n", - "1 [1 Humanities and Social Sciences, 102 History... \n", - "2 [1 Humanities and Social Sciences, 104 Linguis... \n", - "3 [1 Humanities and Social Sciences, 111 Social ... \n", - "4 [1 Humanities and Social Sciences, 101 Ancient... \n", + "0 [multidisciplinary] \n", + "1 [ecology and environment, social sciences gene... \n", + "2 [biology and biochemistry, chemistry and chemi... \n", + "3 [multidisciplinary] \n", + "4 [earth and planetary sciences] \n", "\n", - " provider_type \\\n", - "0 [dataProvider] \n", - "1 [dataProvider] \n", - "2 [dataProvider, serviceProvider] \n", - "3 [dataProvider, serviceProvider] \n", - "4 [dataProvider, serviceProvider] \n", + " content_type \\\n", + "0 [journal_articles, conference_and_workshop_pap... \n", + "1 [journal_articles, conference_and_workshop_pap... \n", + "2 [journal_articles, conference_and_workshop_pap... \n", + "3 [journal_articles, conference_and_workshop_pap... \n", + "4 [journal_articles, conference_and_workshop_pap... \n", "\n", - " keyword \\\n", - "0 [FAIR, Middle East, crime, demography, economy... \n", - "1 [US History] \n", - "2 [Australian German, FOLK, German dialects, Pfe... \n", - "3 [FAIR, census, demographic survey, demography,... \n", - "4 [FAIR, archaeology, cultural heritage, prehist... \n", + " institution metadata_policy \\\n", + "0 [[university of utrecht, [universiteit utrecht... True \n", + "1 [[indian institute of management kozhikode, [i... True \n", + "2 [[california institute of technology, [caltech... True \n", + "3 [[dublin city university, [dcu], ie, [], , htt... True \n", + "4 [[istituto nazionale di geofisica e vulcanolog... True \n", "\n", - " institution \n", - "0 [[Odum Institute for Research in Social Scienc... \n", - "1 [[The U.S. National Archives and Records Admin... \n", - "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n", - "3 [[Odum Institute for Research in Social Scienc... \n", - "4 [[Arts and Humanities Research Council, [AHRC]... " + " data_policy submission_policy content_policy software \\\n", + "0 True False True dspace \n", + "1 True True True dspace 4.1 \n", + "2 True True True eprints 3.1.3 \n", + "3 True True True eprints 3.0.5 \n", + "4 True True True dspace 5.8.1-snapshot \n", + "\n", + " api \n", + "0 true \n", + "1 true \n", + "2 true \n", + "3 true \n", + "4 true " ] }, "execution_count": 3, @@ -173,262 +267,6 @@ "output_type": "execute_result" } ], - "source": [ - "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n", - " converters={'subject': ast.literal_eval,\n", - " 'keyword': ast.literal_eval,\n", - " 'additional_name': ast.literal_eval,\n", - " 'repository_id': ast.literal_eval,\n", - " 'type': ast.literal_eval,\n", - " 'content_type': ast.literal_eval,\n", - " 'provider_type': ast.literal_eval,\n", - " 'institution': ast.literal_eval\n", - " },\n", - " usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'])\n", - "re3data_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**HERE I AM FILTERING SERVICE PROVIDERS OUT!!**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "re3data_df = re3data_df.explode('provider_type')\n", - "re3data_df = re3data_df[re3data_df.provider_type != 'serviceProvider']" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
count2467246724672467245924672467
unique2466246391282122482447
topr3d100011987Landmap[disciplinary][1 Humanities and Social Sciences, 2 Life Scie...dataProvider[multidisciplinary][[National Center for Biotechnology Informatio...
freq22157320024591816
\n", - "
" - ], - "text/plain": [ - " re3data_id repository_name type \\\n", - "count 2467 2467 2467 \n", - "unique 2466 2463 9 \n", - "top r3d100011987 Landmap [disciplinary] \n", - "freq 2 2 1573 \n", - "\n", - " subject provider_type \\\n", - "count 2467 2459 \n", - "unique 1282 1 \n", - "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n", - "freq 200 2459 \n", - "\n", - " keyword institution \n", - "count 2467 2467 \n", - "unique 2248 2447 \n", - "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n", - "freq 181 6 " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re3data_df.describe(include='all')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**openDOAR**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
opendoar_idrepository_nametypesubjectinstitution
0101utrecht university repositoryinstitutional[multidisciplinary][[university of utrecht, [universiteit utrecht...
1115dspace at indian institute of management kozhi...institutional[ecology and environment, social sciences gene...[[indian institute of management kozhikode, [i...
241caltech engineering and science onlineinstitutional[biology and biochemistry, chemistry and chemi...[[california institute of technology, [caltech...
3119dcu online research access serviceinstitutional[multidisciplinary][[dublin city university, [dcu], ie, [], , htt...
4129earth-prints repositorydisciplinary[earth and planetary sciences][[istituto nazionale di geofisica e vulcanolog...
\n", - "
" - ], - "text/plain": [ - " opendoar_id repository_name \\\n", - "0 101 utrecht university repository \n", - "1 115 dspace at indian institute of management kozhi... \n", - "2 41 caltech engineering and science online \n", - "3 119 dcu online research access service \n", - "4 129 earth-prints repository \n", - "\n", - " type subject \\\n", - "0 institutional [multidisciplinary] \n", - "1 institutional [ecology and environment, social sciences gene... \n", - "2 institutional [biology and biochemistry, chemistry and chemi... \n", - "3 institutional [multidisciplinary] \n", - "4 disciplinary [earth and planetary sciences] \n", - "\n", - " institution \n", - "0 [[university of utrecht, [universiteit utrecht... \n", - "1 [[indian institute of management kozhikode, [i... \n", - "2 [[california institute of technology, [caltech... \n", - "3 [[dublin city university, [dcu], ie, [], , htt... \n", - "4 [[istituto nazionale di geofisica e vulcanolog... " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n", " converters={'subject': ast.literal_eval,\n", @@ -436,14 +274,13 @@ " 'opendoar_id': ast.literal_eval,\n", " 'content_type': ast.literal_eval,\n", " 'institution': ast.literal_eval\n", - " },\n", - " usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'])\n", + " })\n", "opendoar_df.head()" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -467,135 +304,369 @@ " \n", " \n", " \n", + " openaire_id\n", " opendoar_id\n", " repository_name\n", + " additional_name\n", + " repository_url\n", + " description\n", " type\n", + " update_date\n", + " start_date\n", " subject\n", + " content_type\n", " institution\n", + " metadata_policy\n", + " data_policy\n", + " submission_policy\n", + " content_policy\n", + " software\n", + " api\n", " \n", " \n", " \n", " \n", " count\n", + " 5707\n", " 5707.000000\n", " 5707\n", " 5707\n", " 5707\n", + " 5425\n", + " 5707\n", + " 5707\n", + " 5707\n", + " 5707\n", + " 5707\n", + " 5707\n", + " 5707\n", + " 5707\n", + " 5707\n", + " 5707\n", + " 5707\n", " 5707\n", " \n", " \n", " unique\n", + " 5707\n", " NaN\n", " 5670\n", + " 2097\n", + " 5670\n", + " 4622\n", " 4\n", + " 2501\n", + " 5538\n", " 820\n", + " 477\n", " 5098\n", + " 2\n", + " 2\n", + " 2\n", + " 2\n", + " 321\n", + " 2\n", " \n", " \n", " top\n", + " opendoar____::3cf166c6b73f030b4f67eeaeba301103\n", " NaN\n", - " arch\n", + " hiroshima associated repository portal\n", + " []\n", + " http://harp.lib.hiroshima-u.ac.jp/\n", + " this site provides access to the research outp...\n", " institutional\n", + " 2020-09-18 12:53:48\n", + " 2020-09-18 12:53:48\n", " [multidisciplinary]\n", + " [theses_and_dissertations]\n", " [[rijksuniversiteit groningen, [rug], nl, [], ...\n", + " False\n", + " False\n", + " False\n", + " False\n", + " dspace\n", + " true\n", " \n", " \n", " freq\n", + " 1\n", " NaN\n", " 3\n", + " 3569\n", + " 3\n", + " 95\n", " 5067\n", + " 82\n", + " 82\n", " 3212\n", + " 460\n", " 26\n", + " 4116\n", + " 4101\n", + " 5016\n", + " 4075\n", + " 800\n", + " 4374\n", " \n", " \n", " mean\n", + " NaN\n", " 4008.118801\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " std\n", + " NaN\n", " 2869.948770\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " min\n", + " NaN\n", " 2.000000\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 25%\n", + " NaN\n", " 1823.000000\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 50%\n", + " NaN\n", " 3361.000000\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 75%\n", + " NaN\n", " 5095.000000\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " max\n", + " NaN\n", " 10175.000000\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", "\n", "" ], "text/plain": [ - " opendoar_id repository_name type subject \\\n", - "count 5707.000000 5707 5707 5707 \n", - "unique NaN 5670 4 820 \n", - "top NaN arch institutional [multidisciplinary] \n", - "freq NaN 3 5067 3212 \n", - "mean 4008.118801 NaN NaN NaN \n", - "std 2869.948770 NaN NaN NaN \n", - "min 2.000000 NaN NaN NaN \n", - "25% 1823.000000 NaN NaN NaN \n", - "50% 3361.000000 NaN NaN NaN \n", - "75% 5095.000000 NaN NaN NaN \n", - "max 10175.000000 NaN NaN NaN \n", + " openaire_id opendoar_id \\\n", + "count 5707 5707.000000 \n", + "unique 5707 NaN \n", + "top opendoar____::3cf166c6b73f030b4f67eeaeba301103 NaN \n", + "freq 1 NaN \n", + "mean NaN 4008.118801 \n", + "std NaN 2869.948770 \n", + "min NaN 2.000000 \n", + "25% NaN 1823.000000 \n", + "50% NaN 3361.000000 \n", + "75% NaN 5095.000000 \n", + "max NaN 10175.000000 \n", "\n", - " institution \n", - "count 5707 \n", - "unique 5098 \n", - "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n", - "freq 26 \n", - "mean NaN \n", - "std NaN \n", - "min NaN \n", - "25% NaN \n", - "50% NaN \n", - "75% NaN \n", - "max NaN " + " repository_name additional_name \\\n", + "count 5707 5707 \n", + "unique 5670 2097 \n", + "top hiroshima associated repository portal [] \n", + "freq 3 3569 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " repository_url \\\n", + "count 5707 \n", + "unique 5670 \n", + "top http://harp.lib.hiroshima-u.ac.jp/ \n", + "freq 3 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " description type \\\n", + "count 5425 5707 \n", + "unique 4622 4 \n", + "top this site provides access to the research outp... institutional \n", + "freq 95 5067 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " update_date start_date subject \\\n", + "count 5707 5707 5707 \n", + "unique 2501 5538 820 \n", + "top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n", + "freq 82 82 3212 \n", + "mean NaN NaN NaN \n", + "std NaN NaN NaN \n", + "min NaN NaN NaN \n", + "25% NaN NaN NaN \n", + "50% NaN NaN NaN \n", + "75% NaN NaN NaN \n", + "max NaN NaN NaN \n", + "\n", + " content_type \\\n", + "count 5707 \n", + "unique 477 \n", + "top [theses_and_dissertations] \n", + "freq 460 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " institution metadata_policy \\\n", + "count 5707 5707 \n", + "unique 5098 2 \n", + "top [[rijksuniversiteit groningen, [rug], nl, [], ... False \n", + "freq 26 4116 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " data_policy submission_policy content_policy software api \n", + "count 5707 5707 5707 5707 5707 \n", + "unique 2 2 2 321 2 \n", + "top False False False dspace true \n", + "freq 4101 5016 4075 800 4374 \n", + "mean NaN NaN NaN NaN NaN \n", + "std NaN NaN NaN NaN NaN \n", + "min NaN NaN NaN NaN NaN \n", + "25% NaN NaN NaN NaN NaN \n", + "50% NaN NaN NaN NaN NaN \n", + "75% NaN NaN NaN NaN NaN \n", + "max NaN NaN NaN NaN NaN " ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -604,706 +675,50 @@ "opendoar_df.describe(include='all')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**ROAR**" - ] - }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
eprintidhome_pagetitlelocation_countrysubjects
0921http://alcme.oclc.org/ndltd/index.htmlNetworked Digital Library of Theses and Disser...usNaN
11489http://prensahistorica.mcu.es/prensahistorica/...Virtual Library of Historical PressesNaN
2606http://hal.archives-ouvertes.fr/HAL: Hyper Article en LignefrNaN
3606NaNNaNNaNNaN
4606NaNNaNNaNNaN
\n", - "
" - ], "text/plain": [ - " eprintid home_page \\\n", - "0 921 http://alcme.oclc.org/ndltd/index.html \n", - "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n", - "2 606 http://hal.archives-ouvertes.fr/ \n", - "3 606 NaN \n", - "4 606 NaN \n", - "\n", - " title location_country subjects \n", - "0 Networked Digital Library of Theses and Disser... us NaN \n", - "1 Virtual Library of Historical Press es NaN \n", - "2 HAL: Hyper Article en Ligne fr NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN " + "openaire_id 0\n", + "opendoar_id 0\n", + "repository_name 0\n", + "additional_name 0\n", + "repository_url 0\n", + "description 282\n", + "type 0\n", + "update_date 0\n", + "start_date 0\n", + "subject 0\n", + "content_type 0\n", + "institution 0\n", + "metadata_policy 0\n", + "data_policy 0\n", + "submission_policy 0\n", + "content_policy 0\n", + "software 0\n", + "api 0\n", + "dtype: int64" ] }, - "execution_count": 45, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv',\n", - " usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'])\n", - "roar_df.head()" + "opendoar_df.isna().sum()" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
eprintidhome_pagetitlelocation_countrysubjects
1062303NaNFaculty Scholarship at The Claremont CollegesusAS
1072303NaNNaNNaNBF
1082303NaNNaNNaNBL
1092303NaNNaNNaNCC
1102303NaNNaNNaNGN
1112303NaNNaNNaNH1
1122303NaNNaNNaNHB
1132303NaNNaNNaNJA
1142303NaNNaNNaNLB
1152303NaNNaNNaNNX
1162303NaNNaNNaNPQ
1172303NaNNaNNaNQA
\n", - "
" - ], - "text/plain": [ - " eprintid home_page title \\\n", - "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n", - "107 2303 NaN NaN \n", - "108 2303 NaN NaN \n", - "109 2303 NaN NaN \n", - "110 2303 NaN NaN \n", - "111 2303 NaN NaN \n", - "112 2303 NaN NaN \n", - "113 2303 NaN NaN \n", - "114 2303 NaN NaN \n", - "115 2303 NaN NaN \n", - "116 2303 NaN NaN \n", - "117 2303 NaN NaN \n", - "\n", - " location_country subjects \n", - "106 us AS \n", - "107 NaN BF \n", - "108 NaN BL \n", - "109 NaN CC \n", - "110 NaN GN \n", - "111 NaN H1 \n", - "112 NaN HB \n", - "113 NaN JA \n", - "114 NaN LB \n", - "115 NaN NX \n", - "116 NaN PQ \n", - "117 NaN QA " - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roar_df[roar_df.eprintid == 2303]" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
eprintidhome_pagetitlelocation_countrysubjects
count5314.0000005263526850241225
uniqueNaN51565027134123
topNaNhttp://ir.lib.isu.edu.tw/Repositorio InstitucionalusH1
freqNaN37877147
mean6389.464434NaNNaNNaNNaN
std5159.573937NaNNaNNaNNaN
min1.000000NaNNaNNaNNaN
25%1490.250000NaNNaNNaNNaN
50%4990.500000NaNNaNNaNNaN
75%10452.750000NaNNaNNaNNaN
max17302.000000NaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " eprintid home_page title \\\n", - "count 5314.000000 5263 5268 \n", - "unique NaN 5156 5027 \n", - "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n", - "freq NaN 3 7 \n", - "mean 6389.464434 NaN NaN \n", - "std 5159.573937 NaN NaN \n", - "min 1.000000 NaN NaN \n", - "25% 1490.250000 NaN NaN \n", - "50% 4990.500000 NaN NaN \n", - "75% 10452.750000 NaN NaN \n", - "max 17302.000000 NaN NaN \n", - "\n", - " location_country subjects \n", - "count 5024 1225 \n", - "unique 134 123 \n", - "top us H1 \n", - "freq 877 147 \n", - "mean NaN NaN \n", - "std NaN NaN \n", - "min NaN NaN \n", - "25% NaN NaN \n", - "50% NaN NaN \n", - "75% NaN NaN \n", - "max NaN NaN " - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roar_df.describe(include='all')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**FAIRsharing**" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
full_nameshort_namefs_urlurlcountriessubjects
0GenBankGenBankhttps://fairsharing.org/10.25504/FAIRsharing.9...https://www.ncbi.nlm.nih.gov/genbank/European Union,Japan,United StatesBioinformatics,Data Management,Data Submission...
1GlycoNAVIGlycoNAVIhttps://fairsharing.org/10.25504/FAIRsharing.w...https://glyconavi.org/JapanChemistry,Glycomics,Life Science,Organic Chemi...
2ADHDgeneADHDgenehttps://fairsharing.org/10.25504/FAIRsharing.m...http://adhd.psych.ac.cn/ChinaBiomedical Science,Genetics
3Allele frequency resource for research and tea...ALFREDhttps://fairsharing.org/10.25504/FAIRsharing.y...http://alfred.med.yale.eduUnited StatesLife Science
4Animal Transcription Factor DatabaseAnimalTFDBhttps://fairsharing.org/10.25504/FAIRsharing.e...http://bioinfo.life.hust.edu.cn/AnimalTFDB/ChinaLife Science
\n", - "
" - ], - "text/plain": [ - " full_name short_name \\\n", - "0 GenBank GenBank \n", - "1 GlycoNAVI GlycoNAVI \n", - "2 ADHDgene ADHDgene \n", - "3 Allele frequency resource for research and tea... ALFRED \n", - "4 Animal Transcription Factor Database AnimalTFDB \n", - "\n", - " fs_url \\\n", - "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n", - "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n", - "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n", - "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n", - "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n", - "\n", - " url \\\n", - "0 https://www.ncbi.nlm.nih.gov/genbank/ \n", - "1 https://glyconavi.org/ \n", - "2 http://adhd.psych.ac.cn/ \n", - "3 http://alfred.med.yale.edu \n", - "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n", - "\n", - " countries \\\n", - "0 European Union,Japan,United States \n", - "1 Japan \n", - "2 China \n", - "3 United States \n", - "4 China \n", - "\n", - " subjects \n", - "0 Bioinformatics,Data Management,Data Submission... \n", - "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n", - "2 Biomedical Science,Genetics \n", - "3 Life Science \n", - "4 Life Science " - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n", - " delimiter='|', header=0,\n", - " names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n", - "fairsharing_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
full_nameshort_namefs_urlurlcountriessubjects
count175217521752175217491690
unique1752174117521752178834
topCiteAbCGDhttps://fairsharing.org/10.25504/FAIRsharing.1...http://www.plexdb.org/United StatesLife Science
freq1311588367
\n", - "
" - ], - "text/plain": [ - " full_name short_name \\\n", - "count 1752 1752 \n", - "unique 1752 1741 \n", - "top CiteAb CGD \n", - "freq 1 3 \n", - "\n", - " fs_url \\\n", - "count 1752 \n", - "unique 1752 \n", - "top https://fairsharing.org/10.25504/FAIRsharing.1... \n", - "freq 1 \n", - "\n", - " url countries subjects \n", - "count 1752 1749 1690 \n", - "unique 1752 178 834 \n", - "top http://www.plexdb.org/ United States Life Science \n", - "freq 1 588 367 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fairsharing_df.describe(include='all')" - ] + "source": [] } ], "metadata": { diff --git a/notebooks/01.4-exploration-fairsharing.ipynb b/notebooks/01.4-exploration-fairsharing.ipynb index b79a854..3b2334a 100644 --- a/notebooks/01.4-exploration-fairsharing.ipynb +++ b/notebooks/01.4-exploration-fairsharing.ipynb @@ -34,1043 +34,9 @@ "## Loading datasets" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**re3data**" - ] - }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
0r3d100000001Odum Institute Archive Dataverse[disciplinary][1 Humanities and Social Sciences, 111 Social ...[dataProvider][FAIR, Middle East, crime, demography, economy...[[Odum Institute for Research in Social Scienc...
1r3d100000002Access to Archival Databases[disciplinary][1 Humanities and Social Sciences, 102 History...[dataProvider][US History][[The U.S. National Archives and Records Admin...
2r3d100000004Datenbank Gesprochenes Deutsch[disciplinary][1 Humanities and Social Sciences, 104 Linguis...[dataProvider, serviceProvider][Australian German, FOLK, German dialects, Pfe...[[Institut für Deutsche Sprache, Archiv für Ge...
3r3d100000005UNC Dataverse[institutional][1 Humanities and Social Sciences, 111 Social ...[dataProvider, serviceProvider][FAIR, census, demographic survey, demography,...[[Odum Institute for Research in Social Scienc...
4r3d100000006Archaeology Data Service[disciplinary][1 Humanities and Social Sciences, 101 Ancient...[dataProvider, serviceProvider][FAIR, archaeology, cultural heritage, prehist...[[Arts and Humanities Research Council, [AHRC]...
\n", - "
" - ], - "text/plain": [ - " re3data_id repository_name type \\\n", - "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n", - "1 r3d100000002 Access to Archival Databases [disciplinary] \n", - "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n", - "3 r3d100000005 UNC Dataverse [institutional] \n", - "4 r3d100000006 Archaeology Data Service [disciplinary] \n", - "\n", - " subject \\\n", - "0 [1 Humanities and Social Sciences, 111 Social ... \n", - "1 [1 Humanities and Social Sciences, 102 History... \n", - "2 [1 Humanities and Social Sciences, 104 Linguis... \n", - "3 [1 Humanities and Social Sciences, 111 Social ... \n", - "4 [1 Humanities and Social Sciences, 101 Ancient... \n", - "\n", - " provider_type \\\n", - "0 [dataProvider] \n", - "1 [dataProvider] \n", - "2 [dataProvider, serviceProvider] \n", - "3 [dataProvider, serviceProvider] \n", - "4 [dataProvider, serviceProvider] \n", - "\n", - " keyword \\\n", - "0 [FAIR, Middle East, crime, demography, economy... \n", - "1 [US History] \n", - "2 [Australian German, FOLK, German dialects, Pfe... \n", - "3 [FAIR, census, demographic survey, demography,... \n", - "4 [FAIR, archaeology, cultural heritage, prehist... \n", - "\n", - " institution \n", - "0 [[Odum Institute for Research in Social Scienc... \n", - "1 [[The U.S. National Archives and Records Admin... \n", - "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n", - "3 [[Odum Institute for Research in Social Scienc... \n", - "4 [[Arts and Humanities Research Council, [AHRC]... " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n", - " converters={'subject': ast.literal_eval,\n", - " 'keyword': ast.literal_eval,\n", - " 'additional_name': ast.literal_eval,\n", - " 'repository_id': ast.literal_eval,\n", - " 'type': ast.literal_eval,\n", - " 'content_type': ast.literal_eval,\n", - " 'provider_type': ast.literal_eval,\n", - " 'institution': ast.literal_eval\n", - " },\n", - " usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'])\n", - "re3data_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**HERE I AM FILTERING SERVICE PROVIDERS OUT!!**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "re3data_df = re3data_df.explode('provider_type')\n", - "re3data_df = re3data_df[re3data_df.provider_type != 'serviceProvider']" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
count2467246724672467245924672467
unique2466246391282122482447
topr3d100011987Landmap[disciplinary][1 Humanities and Social Sciences, 2 Life Scie...dataProvider[multidisciplinary][[National Center for Biotechnology Informatio...
freq22157320024591816
\n", - "
" - ], - "text/plain": [ - " re3data_id repository_name type \\\n", - "count 2467 2467 2467 \n", - "unique 2466 2463 9 \n", - "top r3d100011987 Landmap [disciplinary] \n", - "freq 2 2 1573 \n", - "\n", - " subject provider_type \\\n", - "count 2467 2459 \n", - "unique 1282 1 \n", - "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n", - "freq 200 2459 \n", - "\n", - " keyword institution \n", - "count 2467 2467 \n", - "unique 2248 2447 \n", - "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n", - "freq 181 6 " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re3data_df.describe(include='all')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**openDOAR**" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
opendoar_idrepository_nametypesubjectinstitution
0101utrecht university repositoryinstitutional[multidisciplinary][[university of utrecht, [universiteit utrecht...
1115dspace at indian institute of management kozhi...institutional[ecology and environment, social sciences gene...[[indian institute of management kozhikode, [i...
241caltech engineering and science onlineinstitutional[biology and biochemistry, chemistry and chemi...[[california institute of technology, [caltech...
3119dcu online research access serviceinstitutional[multidisciplinary][[dublin city university, [dcu], ie, [], , htt...
4129earth-prints repositorydisciplinary[earth and planetary sciences][[istituto nazionale di geofisica e vulcanolog...
\n", - "
" - ], - "text/plain": [ - " opendoar_id repository_name \\\n", - "0 101 utrecht university repository \n", - "1 115 dspace at indian institute of management kozhi... \n", - "2 41 caltech engineering and science online \n", - "3 119 dcu online research access service \n", - "4 129 earth-prints repository \n", - "\n", - " type subject \\\n", - "0 institutional [multidisciplinary] \n", - "1 institutional [ecology and environment, social sciences gene... \n", - "2 institutional [biology and biochemistry, chemistry and chemi... \n", - "3 institutional [multidisciplinary] \n", - "4 disciplinary [earth and planetary sciences] \n", - "\n", - " institution \n", - "0 [[university of utrecht, [universiteit utrecht... \n", - "1 [[indian institute of management kozhikode, [i... \n", - "2 [[california institute of technology, [caltech... \n", - "3 [[dublin city university, [dcu], ie, [], , htt... \n", - "4 [[istituto nazionale di geofisica e vulcanolog... " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n", - " converters={'subject': ast.literal_eval,\n", - " 'additional_name': ast.literal_eval,\n", - " 'opendoar_id': ast.literal_eval,\n", - " 'content_type': ast.literal_eval,\n", - " 'institution': ast.literal_eval\n", - " },\n", - " usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'])\n", - "opendoar_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
opendoar_idrepository_nametypesubjectinstitution
count5707.0000005707570757075707
uniqueNaN567048205098
topNaNarchinstitutional[multidisciplinary][[rijksuniversiteit groningen, [rug], nl, [], ...
freqNaN35067321226
mean4008.118801NaNNaNNaNNaN
std2869.948770NaNNaNNaNNaN
min2.000000NaNNaNNaNNaN
25%1823.000000NaNNaNNaNNaN
50%3361.000000NaNNaNNaNNaN
75%5095.000000NaNNaNNaNNaN
max10175.000000NaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " opendoar_id repository_name type subject \\\n", - "count 5707.000000 5707 5707 5707 \n", - "unique NaN 5670 4 820 \n", - "top NaN arch institutional [multidisciplinary] \n", - "freq NaN 3 5067 3212 \n", - "mean 4008.118801 NaN NaN NaN \n", - "std 2869.948770 NaN NaN NaN \n", - "min 2.000000 NaN NaN NaN \n", - "25% 1823.000000 NaN NaN NaN \n", - "50% 3361.000000 NaN NaN NaN \n", - "75% 5095.000000 NaN NaN NaN \n", - "max 10175.000000 NaN NaN NaN \n", - "\n", - " institution \n", - "count 5707 \n", - "unique 5098 \n", - "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n", - "freq 26 \n", - "mean NaN \n", - "std NaN \n", - "min NaN \n", - "25% NaN \n", - "50% NaN \n", - "75% NaN \n", - "max NaN " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "opendoar_df.describe(include='all')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**ROAR**" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
eprintidhome_pagetitlelocation_countrysubjects
0921http://alcme.oclc.org/ndltd/index.htmlNetworked Digital Library of Theses and Disser...usNaN
11489http://prensahistorica.mcu.es/prensahistorica/...Virtual Library of Historical PressesNaN
2606http://hal.archives-ouvertes.fr/HAL: Hyper Article en LignefrNaN
3606NaNNaNNaNNaN
4606NaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " eprintid home_page \\\n", - "0 921 http://alcme.oclc.org/ndltd/index.html \n", - "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n", - "2 606 http://hal.archives-ouvertes.fr/ \n", - "3 606 NaN \n", - "4 606 NaN \n", - "\n", - " title location_country subjects \n", - "0 Networked Digital Library of Theses and Disser... us NaN \n", - "1 Virtual Library of Historical Press es NaN \n", - "2 HAL: Hyper Article en Ligne fr NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN " - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv',\n", - " usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'])\n", - "roar_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
eprintidhome_pagetitlelocation_countrysubjects
1062303NaNFaculty Scholarship at The Claremont CollegesusAS
1072303NaNNaNNaNBF
1082303NaNNaNNaNBL
1092303NaNNaNNaNCC
1102303NaNNaNNaNGN
1112303NaNNaNNaNH1
1122303NaNNaNNaNHB
1132303NaNNaNNaNJA
1142303NaNNaNNaNLB
1152303NaNNaNNaNNX
1162303NaNNaNNaNPQ
1172303NaNNaNNaNQA
\n", - "
" - ], - "text/plain": [ - " eprintid home_page title \\\n", - "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n", - "107 2303 NaN NaN \n", - "108 2303 NaN NaN \n", - "109 2303 NaN NaN \n", - "110 2303 NaN NaN \n", - "111 2303 NaN NaN \n", - "112 2303 NaN NaN \n", - "113 2303 NaN NaN \n", - "114 2303 NaN NaN \n", - "115 2303 NaN NaN \n", - "116 2303 NaN NaN \n", - "117 2303 NaN NaN \n", - "\n", - " location_country subjects \n", - "106 us AS \n", - "107 NaN BF \n", - "108 NaN BL \n", - "109 NaN CC \n", - "110 NaN GN \n", - "111 NaN H1 \n", - "112 NaN HB \n", - "113 NaN JA \n", - "114 NaN LB \n", - "115 NaN NX \n", - "116 NaN PQ \n", - "117 NaN QA " - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roar_df[roar_df.eprintid == 2303]" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
eprintidhome_pagetitlelocation_countrysubjects
count5314.0000005263526850241225
uniqueNaN51565027134123
topNaNhttp://ir.lib.isu.edu.tw/Repositorio InstitucionalusH1
freqNaN37877147
mean6389.464434NaNNaNNaNNaN
std5159.573937NaNNaNNaNNaN
min1.000000NaNNaNNaNNaN
25%1490.250000NaNNaNNaNNaN
50%4990.500000NaNNaNNaNNaN
75%10452.750000NaNNaNNaNNaN
max17302.000000NaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " eprintid home_page title \\\n", - "count 5314.000000 5263 5268 \n", - "unique NaN 5156 5027 \n", - "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n", - "freq NaN 3 7 \n", - "mean 6389.464434 NaN NaN \n", - "std 5159.573937 NaN NaN \n", - "min 1.000000 NaN NaN \n", - "25% 1490.250000 NaN NaN \n", - "50% 4990.500000 NaN NaN \n", - "75% 10452.750000 NaN NaN \n", - "max 17302.000000 NaN NaN \n", - "\n", - " location_country subjects \n", - "count 5024 1225 \n", - "unique 134 123 \n", - "top us H1 \n", - "freq 877 147 \n", - "mean NaN NaN \n", - "std NaN NaN \n", - "min NaN NaN \n", - "25% NaN NaN \n", - "50% NaN NaN \n", - "75% NaN NaN \n", - "max NaN NaN " - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roar_df.describe(include='all')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**FAIRsharing**" - ] - }, - { - "cell_type": "code", - "execution_count": 11, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -1189,7 +155,7 @@ "4 Life Science " ] }, - "execution_count": 11, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -1203,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -1256,10 +222,10 @@ " \n", " \n", " top\n", - " CiteAb\n", + " GBIF France IPT - GBIF France\n", " CGD\n", - " https://fairsharing.org/10.25504/FAIRsharing.1...\n", - " http://www.plexdb.org/\n", + " https://fairsharing.org/10.25504/FAIRsharing.w...\n", + " http://www.ebi.ac.uk/merops/\n", " United States\n", " Life Science\n", " \n", @@ -1277,26 +243,26 @@ "" ], "text/plain": [ - " full_name short_name \\\n", - "count 1752 1752 \n", - "unique 1752 1741 \n", - "top CiteAb CGD \n", - "freq 1 3 \n", + " full_name short_name \\\n", + "count 1752 1752 \n", + "unique 1752 1741 \n", + "top GBIF France IPT - GBIF France CGD \n", + "freq 1 3 \n", "\n", " fs_url \\\n", "count 1752 \n", "unique 1752 \n", - "top https://fairsharing.org/10.25504/FAIRsharing.1... \n", + "top https://fairsharing.org/10.25504/FAIRsharing.w... \n", "freq 1 \n", "\n", - " url countries subjects \n", - "count 1752 1749 1690 \n", - "unique 1752 178 834 \n", - "top http://www.plexdb.org/ United States Life Science \n", - "freq 1 588 367 " + " url countries subjects \n", + "count 1752 1749 1690 \n", + "unique 1752 178 834 \n", + "top http://www.ebi.ac.uk/merops/ United States Life Science \n", + "freq 1 588 367 " ] }, - "execution_count": 12, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -1304,6 +270,39 @@ "source": [ "fairsharing_df.describe(include='all')" ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "full_name 0\n", + "short_name 0\n", + "fs_url 0\n", + "url 0\n", + "countries 3\n", + "subjects 62\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {