each registry has a basic analysis

master
Andrea Mannocci 3 years ago
parent 434fe5ed20
commit dd6b79e69f

@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@ -452,7 +452,7 @@
"4 2021-06-11 "
]
},
"execution_count": 20,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@ -473,7 +473,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -492,7 +492,7 @@
" dtype='object')"
]
},
"execution_count": 14,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@ -503,7 +503,22 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def empty_list_is_nan(cell):\n",
"    \"\"\"Return NaN in place of an empty list; return any other value unchanged.\"\"\"\n",
"    if isinstance(cell, list) and len(cell) == 0:\n",
"        return np.nan\n",
"    return cell\n",
"\n",
"# Apply element-wise, one column at a time, with Series.map.\n",
"# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map);\n",
"# Series.map exists in every pandas version, so this runs without warnings on old and new installs.\n",
"re3data_df = re3data_df.apply(lambda column: column.map(empty_list_is_nan))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
@ -574,21 +589,21 @@
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2137</td>\n",
" <td>2686</td>\n",
" <td>829</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2677</td>\n",
" <td>1260</td>\n",
" <td>1248</td>\n",
" <td>1762</td>\n",
" <td>146</td>\n",
" <td>2685</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2700</td>\n",
" <td>2699</td>\n",
" <td>2699</td>\n",
" <td>2706</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n",
@ -616,21 +631,21 @@
" <td>2707</td>\n",
" <td>2707</td>\n",
" <td>2704</td>\n",
" <td>2129</td>\n",
" <td>2128</td>\n",
" <td>2683</td>\n",
" <td>829</td>\n",
" <td>828</td>\n",
" <td>2705</td>\n",
" <td>9</td>\n",
" <td>8</td>\n",
" <td>1233</td>\n",
" <td>687</td>\n",
" <td>351</td>\n",
" <td>79</td>\n",
" <td>1368</td>\n",
" <td>1367</td>\n",
" <td>2</td>\n",
" <td>1324</td>\n",
" <td>5</td>\n",
" <td>2475</td>\n",
" <td>2686</td>\n",
" <td>1323</td>\n",
" <td>4</td>\n",
" <td>2474</td>\n",
" <td>2685</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
@ -655,12 +670,12 @@
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>re3data_____::d8e2164dd005d3961c23e0762453cfb1</td>\n",
" <td>r3d100010836</td>\n",
" <td>UCLA Social Science Data Archive Dataverse</td>\n",
" <td>[]</td>\n",
" <td>re3data_____::4cea5a5ea78542232a51190879756661</td>\n",
" <td>r3d100011254</td>\n",
" <td>EarthChem Library</td>\n",
" <td>[IRIS]</td>\n",
" <td>http://www.jcvi.org/cms/home/</td>\n",
" <td>[]</td>\n",
" <td>[doi:10.17171/1-6]</td>\n",
" <td>The repository is no longer available. &gt;&gt;&gt;!!!&lt;...</td>\n",
" <td>[disciplinary]</td>\n",
" <td>2 datasets</td>\n",
@ -700,9 +715,9 @@
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>570</td>\n",
" <td>2</td>\n",
" <td>1878</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1713</td>\n",
" <td>6</td>\n",
@ -745,48 +760,42 @@
" openaire_id re3data_id \\\n",
"count 2707 2707 \n",
"unique 2707 2707 \n",
"top re3data_____::d8e2164dd005d3961c23e0762453cfb1 r3d100010836 \n",
"top re3data_____::4cea5a5ea78542232a51190879756661 r3d100011254 \n",
"freq 1 1 \n",
"\n",
" repository_name additional_name \\\n",
"count 2707 2707 \n",
"unique 2704 2129 \n",
"top UCLA Social Science Data Archive Dataverse [] \n",
"freq 2 570 \n",
"\n",
" repository_url repository_id \\\n",
"count 2686 2707 \n",
"unique 2683 829 \n",
"top http://www.jcvi.org/cms/home/ [] \n",
"freq 2 1878 \n",
"\n",
" description type \\\n",
"count 2707 2707 \n",
"unique 2705 9 \n",
"top The repository is no longer available. >>>!!!<... [disciplinary] \n",
"freq 2 1713 \n",
"\n",
" size update_date start_date end_date \\\n",
"count 1260 1248 1762 146 \n",
"unique 1233 687 351 79 \n",
"top 2 datasets 2019-05-15 2008 2015 \n",
"freq 6 15 92 11 \n",
" repository_name additional_name repository_url \\\n",
"count 2707 2137 2686 \n",
"unique 2704 2128 2683 \n",
"top EarthChem Library [IRIS] http://www.jcvi.org/cms/home/ \n",
"freq 2 2 2 \n",
"\n",
" repository_id description \\\n",
"count 829 2707 \n",
"unique 828 2705 \n",
"top [doi:10.17171/1-6] The repository is no longer available. >>>!!!<... \n",
"freq 2 2 \n",
"\n",
" type size update_date start_date end_date \\\n",
"count 2677 1260 1248 1762 146 \n",
"unique 8 1233 687 351 79 \n",
"top [disciplinary] 2 datasets 2019-05-15 2008 2015 \n",
"freq 1713 6 15 92 11 \n",
"\n",
" subject mission_statement \\\n",
"count 2707 2707 \n",
"unique 1368 2 \n",
"count 2685 2707 \n",
"unique 1367 2 \n",
"top [1 Humanities and Social Sciences, 2 Life Scie... true \n",
"freq 222 2286 \n",
"\n",
" content_type provider_type keyword \\\n",
"count 2707 2707 2707 \n",
"unique 1324 5 2475 \n",
"count 2700 2699 2699 \n",
"unique 1323 4 2474 \n",
"top [Standard office documents] [dataProvider] [multidisciplinary] \n",
"freq 30 1748 190 \n",
"\n",
" institution policy \\\n",
"count 2707 2707 \n",
"unique 2686 2 \n",
"count 2706 2707 \n",
"unique 2685 2 \n",
"top [[National Center for Biotechnology Informatio... true \n",
"freq 6 2394 \n",
"\n",
@ -827,7 +836,7 @@
"freq 47 "
]
},
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -838,7 +847,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -847,21 +856,21 @@
"openaire_id 0\n",
"re3data_id 0\n",
"repository_name 0\n",
"additional_name 0\n",
"additional_name 570\n",
"repository_url 21\n",
"repository_id 0\n",
"repository_id 1878\n",
"description 0\n",
"type 0\n",
"type 30\n",
"size 1447\n",
"update_date 1459\n",
"start_date 945\n",
"end_date 2561\n",
"subject 0\n",
"subject 22\n",
"mission_statement 0\n",
"content_type 0\n",
"provider_type 0\n",
"keyword 0\n",
"institution 0\n",
"content_type 7\n",
"provider_type 8\n",
"keyword 8\n",
"institution 1\n",
"policy 0\n",
"database_access 0\n",
"database_license 0\n",
@ -886,7 +895,7 @@
"dtype: int64"
]
},
"execution_count": 10,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -897,7 +906,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -911,7 +920,7 @@
" 'Configuration data', 'Networkbased data', nan], dtype=object)"
]
},
"execution_count": 18,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -922,7 +931,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -931,7 +940,7 @@
"array(['dataProvider', 'serviceProvider', nan], dtype=object)"
]
},
"execution_count": 19,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -939,13 +948,6 @@
"source": [
"re3data_df.provider_type.explode().unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

@ -36,7 +36,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 24,
"metadata": {},
"outputs": [
{
@ -262,7 +262,7 @@
"4 true "
]
},
"execution_count": 3,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@ -280,7 +280,47 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',\n",
" 'repository_url', 'description', 'type', 'update_date', 'start_date',\n",
" 'subject', 'content_type', 'institution', 'metadata_policy',\n",
" 'data_policy', 'submission_policy', 'content_policy', 'software',\n",
" 'api'],\n",
" dtype='object')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"opendoar_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def empty_list_is_nan(cell):\n",
"    \"\"\"Return NaN in place of an empty list; return any other value unchanged.\"\"\"\n",
"    if isinstance(cell, list) and len(cell) == 0:\n",
"        return np.nan\n",
"    return cell\n",
"\n",
"# Apply element-wise, one column at a time, with Series.map.\n",
"# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map);\n",
"# Series.map exists in every pandas version, so this runs without warnings on old and new installs.\n",
"opendoar_df = opendoar_df.apply(lambda column: column.map(empty_list_is_nan))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
@ -330,14 +370,14 @@
" <td>5707</td>\n",
" <td>5707.000000</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>2138</td>\n",
" <td>5707</td>\n",
" <td>5425</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5542</td>\n",
" <td>5563</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
@ -351,14 +391,14 @@
" <td>5707</td>\n",
" <td>NaN</td>\n",
" <td>5670</td>\n",
" <td>2097</td>\n",
" <td>2096</td>\n",
" <td>5670</td>\n",
" <td>4622</td>\n",
" <td>4</td>\n",
" <td>2501</td>\n",
" <td>5538</td>\n",
" <td>820</td>\n",
" <td>477</td>\n",
" <td>819</td>\n",
" <td>476</td>\n",
" <td>5098</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
@ -393,7 +433,7 @@
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>3569</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>95</td>\n",
" <td>5067</td>\n",
@ -575,10 +615,10 @@
"max NaN 10175.000000 \n",
"\n",
" repository_name additional_name \\\n",
"count 5707 5707 \n",
"unique 5670 2097 \n",
"count 5707 2138 \n",
"unique 5670 2096 \n",
"top hiroshima associated repository portal [] \n",
"freq 3 3569 \n",
"freq 3 4 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
@ -614,8 +654,8 @@
"max NaN NaN \n",
"\n",
" update_date start_date subject \\\n",
"count 5707 5707 5707 \n",
"unique 2501 5538 820 \n",
"count 5707 5707 5542 \n",
"unique 2501 5538 819 \n",
"top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n",
"freq 82 82 3212 \n",
"mean NaN NaN NaN \n",
@ -627,8 +667,8 @@
"max NaN NaN NaN \n",
"\n",
" content_type \\\n",
"count 5707 \n",
"unique 477 \n",
"count 5563 \n",
"unique 476 \n",
"top [theses_and_dissertations] \n",
"freq 460 \n",
"mean NaN \n",
@ -666,7 +706,7 @@
"max NaN NaN NaN NaN NaN "
]
},
"execution_count": 4,
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@ -677,34 +717,34 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"openaire_id 0\n",
"opendoar_id 0\n",
"repository_name 0\n",
"additional_name 0\n",
"repository_url 0\n",
"description 282\n",
"type 0\n",
"update_date 0\n",
"start_date 0\n",
"subject 0\n",
"content_type 0\n",
"institution 0\n",
"metadata_policy 0\n",
"data_policy 0\n",
"submission_policy 0\n",
"content_policy 0\n",
"software 0\n",
"api 0\n",
"openaire_id 0\n",
"opendoar_id 0\n",
"repository_name 0\n",
"additional_name 3569\n",
"repository_url 0\n",
"description 282\n",
"type 0\n",
"update_date 0\n",
"start_date 0\n",
"subject 165\n",
"content_type 144\n",
"institution 0\n",
"metadata_policy 0\n",
"data_policy 0\n",
"submission_policy 0\n",
"content_policy 0\n",
"software 0\n",
"api 0\n",
"dtype: int64"
]
},
"execution_count": 5,
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
@ -713,6 +753,13 @@
"opendoar_df.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,

Loading…
Cancel
Save