diff --git a/notebooks/01.1-exploration-re3data.ipynb b/notebooks/01.1-exploration-re3data.ipynb index 5bfe9b5..5057a70 100644 --- a/notebooks/01.1-exploration-re3data.ipynb +++ b/notebooks/01.1-exploration-re3data.ipynb @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -452,7 +452,7 @@ "4 2021-06-11 " ] }, - "execution_count": 20, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -473,7 +473,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -492,7 +492,7 @@ " dtype='object')" ] }, - "execution_count": 14, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -503,7 +503,22 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def empty_list_is_nan(cell):\n", + " if isinstance(cell, list):\n", + " return np.nan if len(cell) == 0 else cell\n", + " else:\n", + " return cell\n", + " \n", + "re3data_df = re3data_df.applymap(empty_list_is_nan)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -574,21 +589,21 @@ " 2707\n", " 2707\n", " 2707\n", - " 2707\n", + " 2137\n", " 2686\n", + " 829\n", " 2707\n", - " 2707\n", - " 2707\n", + " 2677\n", " 1260\n", " 1248\n", " 1762\n", " 146\n", + " 2685\n", " 2707\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2707\n", - " 2707\n", + " 2700\n", + " 2699\n", + " 2699\n", + " 2706\n", " 2707\n", " 2707\n", " 2707\n", @@ -616,21 +631,21 @@ " 2707\n", " 2707\n", " 2704\n", - " 2129\n", + " 2128\n", " 2683\n", - " 829\n", + " 828\n", " 2705\n", - " 9\n", + " 8\n", " 1233\n", " 687\n", " 351\n", " 79\n", - " 1368\n", + " 1367\n", " 2\n", - " 1324\n", - " 5\n", - " 2475\n", - " 2686\n", + " 1323\n", + " 4\n", + " 2474\n", + " 2685\n", " 2\n", " 1\n", " 2\n", @@ -655,12 +670,12 @@ " \n", " \n", " top\n", - " re3data_____::d8e2164dd005d3961c23e0762453cfb1\n", - " r3d100010836\n", - " UCLA Social Science Data Archive Dataverse\n", - " []\n", + " re3data_____::4cea5a5ea78542232a51190879756661\n", + " r3d100011254\n", + " EarthChem Library\n", + " [IRIS]\n", " http://www.jcvi.org/cms/home/\n", - " []\n", + " [doi:10.17171/1-6]\n", " The repository is no longer available. >>>!!!<...\n", " [disciplinary]\n", " 2 datasets\n", @@ -700,9 +715,9 @@ " 1\n", " 1\n", " 2\n", - " 570\n", " 2\n", - " 1878\n", + " 2\n", + " 2\n", " 2\n", " 1713\n", " 6\n", @@ -745,48 +760,42 @@ " openaire_id re3data_id \\\n", "count 2707 2707 \n", "unique 2707 2707 \n", - "top re3data_____::d8e2164dd005d3961c23e0762453cfb1 r3d100010836 \n", + "top re3data_____::4cea5a5ea78542232a51190879756661 r3d100011254 \n", "freq 1 1 \n", "\n", - " repository_name additional_name \\\n", - "count 2707 2707 \n", - "unique 2704 2129 \n", - "top UCLA Social Science Data Archive Dataverse [] \n", - "freq 2 570 \n", + " repository_name additional_name repository_url \\\n", + "count 2707 2137 2686 \n", + "unique 2704 2128 2683 \n", + "top EarthChem Library [IRIS] http://www.jcvi.org/cms/home/ \n", + "freq 2 2 2 \n", "\n", - " repository_url repository_id \\\n", - "count 2686 2707 \n", - "unique 2683 829 \n", - "top http://www.jcvi.org/cms/home/ [] \n", - "freq 2 1878 \n", + " repository_id description \\\n", + "count 829 2707 \n", + "unique 828 2705 \n", + "top [doi:10.17171/1-6] The repository is no longer available. >>>!!!<... \n", + "freq 2 2 \n", "\n", - " description type \\\n", - "count 2707 2707 \n", - "unique 2705 9 \n", - "top The repository is no longer available. >>>!!!<... [disciplinary] \n", - "freq 2 1713 \n", - "\n", - " size update_date start_date end_date \\\n", - "count 1260 1248 1762 146 \n", - "unique 1233 687 351 79 \n", - "top 2 datasets 2019-05-15 2008 2015 \n", - "freq 6 15 92 11 \n", + " type size update_date start_date end_date \\\n", + "count 2677 1260 1248 1762 146 \n", + "unique 8 1233 687 351 79 \n", + "top [disciplinary] 2 datasets 2019-05-15 2008 2015 \n", + "freq 1713 6 15 92 11 \n", "\n", " subject mission_statement \\\n", - "count 2707 2707 \n", - "unique 1368 2 \n", + "count 2685 2707 \n", + "unique 1367 2 \n", "top [1 Humanities and Social Sciences, 2 Life Scie... true \n", "freq 222 2286 \n", "\n", " content_type provider_type keyword \\\n", - "count 2707 2707 2707 \n", - "unique 1324 5 2475 \n", + "count 2700 2699 2699 \n", + "unique 1323 4 2474 \n", "top [Standard office documents] [dataProvider] [multidisciplinary] \n", "freq 30 1748 190 \n", "\n", " institution policy \\\n", - "count 2707 2707 \n", - "unique 2686 2 \n", + "count 2706 2707 \n", + "unique 2685 2 \n", "top [[National Center for Biotechnology Informatio... true \n", "freq 6 2394 \n", "\n", @@ -827,7 +836,7 @@ "freq 47 " ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -838,7 +847,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -847,21 +856,21 @@ "openaire_id 0\n", "re3data_id 0\n", "repository_name 0\n", - "additional_name 0\n", + "additional_name 570\n", "repository_url 21\n", - "repository_id 0\n", + "repository_id 1878\n", "description 0\n", - "type 0\n", + "type 30\n", "size 1447\n", "update_date 1459\n", "start_date 945\n", "end_date 2561\n", - "subject 0\n", + "subject 22\n", "mission_statement 0\n", - "content_type 0\n", - "provider_type 0\n", - "keyword 0\n", - "institution 0\n", + "content_type 7\n", + "provider_type 8\n", + "keyword 8\n", + "institution 1\n", "policy 0\n", "database_access 0\n", "database_license 0\n", @@ -886,7 +895,7 @@ "dtype: int64" ] }, - "execution_count": 10, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -897,7 +906,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -911,7 +920,7 @@ " 'Configuration data', 'Networkbased data', nan], dtype=object)" ] }, - "execution_count": 18, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -922,7 +931,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -931,7 +940,7 @@ "array(['dataProvider', 'serviceProvider', nan], dtype=object)" ] }, - "execution_count": 19, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -939,13 +948,6 @@ "source": [ "re3data_df.provider_type.explode().unique()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/01.2-exploration-opendoar.ipynb b/notebooks/01.2-exploration-opendoar.ipynb index dcdad8e..49cc030 100644 --- a/notebooks/01.2-exploration-opendoar.ipynb +++ b/notebooks/01.2-exploration-opendoar.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -262,7 +262,7 @@ "4 true " ] }, - "execution_count": 3, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -280,7 +280,47 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',\n", + " 'repository_url', 'description', 'type', 'update_date', 'start_date',\n", + " 'subject', 'content_type', 'institution', 'metadata_policy',\n", + " 'data_policy', 'submission_policy', 'content_policy', 'software',\n", + " 'api'],\n", + " dtype='object')" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def empty_list_is_nan(cell):\n", + " if isinstance(cell, list):\n", + " return np.nan if len(cell) == 0 else cell\n", + " else:\n", + " return cell\n", + " \n", + "opendoar_df = opendoar_df.applymap(empty_list_is_nan)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -330,14 +370,14 @@ " 5707\n", " 5707.000000\n", " 5707\n", - " 5707\n", + " 2138\n", " 5707\n", " 5425\n", " 5707\n", " 5707\n", " 5707\n", - " 5707\n", - " 5707\n", + " 5542\n", + " 5563\n", " 5707\n", " 5707\n", " 5707\n", @@ -351,14 +391,14 @@ " 5707\n", " NaN\n", " 5670\n", - " 2097\n", + " 2096\n", " 5670\n", " 4622\n", " 4\n", " 2501\n", " 5538\n", - " 820\n", - " 477\n", + " 819\n", + " 476\n", " 5098\n", " 2\n", " 2\n", @@ -393,7 +433,7 @@ " 1\n", " NaN\n", " 3\n", - " 3569\n", + " 4\n", " 3\n", " 95\n", " 5067\n", @@ -575,10 +615,10 @@ "max NaN 10175.000000 \n", "\n", " repository_name additional_name \\\n", - "count 5707 5707 \n", - "unique 5670 2097 \n", + "count 5707 2138 \n", + "unique 5670 2096 \n", "top hiroshima associated repository portal [] \n", - "freq 3 3569 \n", + "freq 3 4 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", @@ -614,8 +654,8 @@ "max NaN NaN \n", "\n", " update_date start_date subject \\\n", - "count 5707 5707 5707 \n", - "unique 2501 5538 820 \n", + "count 5707 5707 5542 \n", + "unique 2501 5538 819 \n", "top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n", "freq 82 82 3212 \n", "mean NaN NaN NaN \n", @@ -627,8 +667,8 @@ "max NaN NaN NaN \n", "\n", " content_type \\\n", - "count 5707 \n", - "unique 477 \n", + "count 5563 \n", + "unique 476 \n", "top [theses_and_dissertations] \n", "freq 460 \n", "mean NaN \n", @@ -666,7 +706,7 @@ "max NaN NaN NaN NaN NaN " ] }, - "execution_count": 4, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -677,34 +717,34 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "openaire_id 0\n", - "opendoar_id 0\n", - "repository_name 0\n", - "additional_name 0\n", - "repository_url 0\n", - "description 282\n", - "type 0\n", - "update_date 0\n", - "start_date 0\n", - "subject 0\n", - "content_type 0\n", - "institution 0\n", - "metadata_policy 0\n", - "data_policy 0\n", - "submission_policy 0\n", - "content_policy 0\n", - "software 0\n", - "api 0\n", + "openaire_id 0\n", + "opendoar_id 0\n", + "repository_name 0\n", + "additional_name 3569\n", + "repository_url 0\n", + "description 282\n", + "type 0\n", + "update_date 0\n", + "start_date 0\n", + "subject 165\n", + "content_type 144\n", + "institution 0\n", + "metadata_policy 0\n", + "data_policy 0\n", + "submission_policy 0\n", + "content_policy 0\n", + "software 0\n", + "api 0\n", "dtype: int64" ] }, - "execution_count": 5, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -713,6 +753,13 @@ "opendoar_df.isna().sum()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null,