each registry has a basic analysis
This commit is contained in:
parent
434fe5ed20
commit
dd6b79e69f
|
@ -51,7 +51,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -452,7 +452,7 @@
|
|||
"4 2021-06-11 "
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -473,7 +473,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -492,7 +492,7 @@
|
|||
" dtype='object')"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -503,7 +503,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def empty_list_is_nan(cell):\n",
|
||||
" if isinstance(cell, list):\n",
|
||||
" return np.nan if len(cell) == 0 else cell\n",
|
||||
" else:\n",
|
||||
" return cell\n",
|
||||
" \n",
|
||||
"re3data_df = re3data_df.applymap(empty_list_is_nan)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -574,21 +589,21 @@
|
|||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2137</td>\n",
|
||||
" <td>2686</td>\n",
|
||||
" <td>829</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2677</td>\n",
|
||||
" <td>1260</td>\n",
|
||||
" <td>1248</td>\n",
|
||||
" <td>1762</td>\n",
|
||||
" <td>146</td>\n",
|
||||
" <td>2685</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2700</td>\n",
|
||||
" <td>2699</td>\n",
|
||||
" <td>2699</td>\n",
|
||||
" <td>2706</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
|
@ -616,21 +631,21 @@
|
|||
" <td>2707</td>\n",
|
||||
" <td>2707</td>\n",
|
||||
" <td>2704</td>\n",
|
||||
" <td>2129</td>\n",
|
||||
" <td>2128</td>\n",
|
||||
" <td>2683</td>\n",
|
||||
" <td>829</td>\n",
|
||||
" <td>828</td>\n",
|
||||
" <td>2705</td>\n",
|
||||
" <td>9</td>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>1233</td>\n",
|
||||
" <td>687</td>\n",
|
||||
" <td>351</td>\n",
|
||||
" <td>79</td>\n",
|
||||
" <td>1368</td>\n",
|
||||
" <td>1367</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1324</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>2475</td>\n",
|
||||
" <td>2686</td>\n",
|
||||
" <td>1323</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>2474</td>\n",
|
||||
" <td>2685</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>2</td>\n",
|
||||
|
@ -655,12 +670,12 @@
|
|||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>top</th>\n",
|
||||
" <td>re3data_____::d8e2164dd005d3961c23e0762453cfb1</td>\n",
|
||||
" <td>r3d100010836</td>\n",
|
||||
" <td>UCLA Social Science Data Archive Dataverse</td>\n",
|
||||
" <td>[]</td>\n",
|
||||
" <td>re3data_____::4cea5a5ea78542232a51190879756661</td>\n",
|
||||
" <td>r3d100011254</td>\n",
|
||||
" <td>EarthChem Library</td>\n",
|
||||
" <td>[IRIS]</td>\n",
|
||||
" <td>http://www.jcvi.org/cms/home/</td>\n",
|
||||
" <td>[]</td>\n",
|
||||
" <td>[doi:10.17171/1-6]</td>\n",
|
||||
" <td>The repository is no longer available. >>>!!!<...</td>\n",
|
||||
" <td>[disciplinary]</td>\n",
|
||||
" <td>2 datasets</td>\n",
|
||||
|
@ -700,9 +715,9 @@
|
|||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>570</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1878</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1713</td>\n",
|
||||
" <td>6</td>\n",
|
||||
|
@ -745,48 +760,42 @@
|
|||
" openaire_id re3data_id \\\n",
|
||||
"count 2707 2707 \n",
|
||||
"unique 2707 2707 \n",
|
||||
"top re3data_____::d8e2164dd005d3961c23e0762453cfb1 r3d100010836 \n",
|
||||
"top re3data_____::4cea5a5ea78542232a51190879756661 r3d100011254 \n",
|
||||
"freq 1 1 \n",
|
||||
"\n",
|
||||
" repository_name additional_name \\\n",
|
||||
"count 2707 2707 \n",
|
||||
"unique 2704 2129 \n",
|
||||
"top UCLA Social Science Data Archive Dataverse [] \n",
|
||||
"freq 2 570 \n",
|
||||
" repository_name additional_name repository_url \\\n",
|
||||
"count 2707 2137 2686 \n",
|
||||
"unique 2704 2128 2683 \n",
|
||||
"top EarthChem Library [IRIS] http://www.jcvi.org/cms/home/ \n",
|
||||
"freq 2 2 2 \n",
|
||||
"\n",
|
||||
" repository_url repository_id \\\n",
|
||||
"count 2686 2707 \n",
|
||||
"unique 2683 829 \n",
|
||||
"top http://www.jcvi.org/cms/home/ [] \n",
|
||||
"freq 2 1878 \n",
|
||||
" repository_id description \\\n",
|
||||
"count 829 2707 \n",
|
||||
"unique 828 2705 \n",
|
||||
"top [doi:10.17171/1-6] The repository is no longer available. >>>!!!<... \n",
|
||||
"freq 2 2 \n",
|
||||
"\n",
|
||||
" description type \\\n",
|
||||
"count 2707 2707 \n",
|
||||
"unique 2705 9 \n",
|
||||
"top The repository is no longer available. >>>!!!<... [disciplinary] \n",
|
||||
"freq 2 1713 \n",
|
||||
"\n",
|
||||
" size update_date start_date end_date \\\n",
|
||||
"count 1260 1248 1762 146 \n",
|
||||
"unique 1233 687 351 79 \n",
|
||||
"top 2 datasets 2019-05-15 2008 2015 \n",
|
||||
"freq 6 15 92 11 \n",
|
||||
" type size update_date start_date end_date \\\n",
|
||||
"count 2677 1260 1248 1762 146 \n",
|
||||
"unique 8 1233 687 351 79 \n",
|
||||
"top [disciplinary] 2 datasets 2019-05-15 2008 2015 \n",
|
||||
"freq 1713 6 15 92 11 \n",
|
||||
"\n",
|
||||
" subject mission_statement \\\n",
|
||||
"count 2707 2707 \n",
|
||||
"unique 1368 2 \n",
|
||||
"count 2685 2707 \n",
|
||||
"unique 1367 2 \n",
|
||||
"top [1 Humanities and Social Sciences, 2 Life Scie... true \n",
|
||||
"freq 222 2286 \n",
|
||||
"\n",
|
||||
" content_type provider_type keyword \\\n",
|
||||
"count 2707 2707 2707 \n",
|
||||
"unique 1324 5 2475 \n",
|
||||
"count 2700 2699 2699 \n",
|
||||
"unique 1323 4 2474 \n",
|
||||
"top [Standard office documents] [dataProvider] [multidisciplinary] \n",
|
||||
"freq 30 1748 190 \n",
|
||||
"\n",
|
||||
" institution policy \\\n",
|
||||
"count 2707 2707 \n",
|
||||
"unique 2686 2 \n",
|
||||
"count 2706 2707 \n",
|
||||
"unique 2685 2 \n",
|
||||
"top [[National Center for Biotechnology Informatio... true \n",
|
||||
"freq 6 2394 \n",
|
||||
"\n",
|
||||
|
@ -827,7 +836,7 @@
|
|||
"freq 47 "
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -838,7 +847,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -847,21 +856,21 @@
|
|||
"openaire_id 0\n",
|
||||
"re3data_id 0\n",
|
||||
"repository_name 0\n",
|
||||
"additional_name 0\n",
|
||||
"additional_name 570\n",
|
||||
"repository_url 21\n",
|
||||
"repository_id 0\n",
|
||||
"repository_id 1878\n",
|
||||
"description 0\n",
|
||||
"type 0\n",
|
||||
"type 30\n",
|
||||
"size 1447\n",
|
||||
"update_date 1459\n",
|
||||
"start_date 945\n",
|
||||
"end_date 2561\n",
|
||||
"subject 0\n",
|
||||
"subject 22\n",
|
||||
"mission_statement 0\n",
|
||||
"content_type 0\n",
|
||||
"provider_type 0\n",
|
||||
"keyword 0\n",
|
||||
"institution 0\n",
|
||||
"content_type 7\n",
|
||||
"provider_type 8\n",
|
||||
"keyword 8\n",
|
||||
"institution 1\n",
|
||||
"policy 0\n",
|
||||
"database_access 0\n",
|
||||
"database_license 0\n",
|
||||
|
@ -886,7 +895,7 @@
|
|||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -897,7 +906,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -911,7 +920,7 @@
|
|||
" 'Configuration data', 'Networkbased data', nan], dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -922,7 +931,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -931,7 +940,7 @@
|
|||
"array(['dataProvider', 'serviceProvider', nan], dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -939,13 +948,6 @@
|
|||
"source": [
|
||||
"re3data_df.provider_type.explode().unique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -262,7 +262,7 @@
|
|||
"4 true "
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -280,7 +280,47 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',\n",
|
||||
" 'repository_url', 'description', 'type', 'update_date', 'start_date',\n",
|
||||
" 'subject', 'content_type', 'institution', 'metadata_policy',\n",
|
||||
" 'data_policy', 'submission_policy', 'content_policy', 'software',\n",
|
||||
" 'api'],\n",
|
||||
" dtype='object')"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"opendoar_df.columns"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def empty_list_is_nan(cell):\n",
|
||||
" if isinstance(cell, list):\n",
|
||||
" return np.nan if len(cell) == 0 else cell\n",
|
||||
" else:\n",
|
||||
" return cell\n",
|
||||
" \n",
|
||||
"opendoar_df = opendoar_df.applymap(empty_list_is_nan)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -330,14 +370,14 @@
|
|||
" <td>5707</td>\n",
|
||||
" <td>5707.000000</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>2138</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>5425</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>5542</td>\n",
|
||||
" <td>5563</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
" <td>5707</td>\n",
|
||||
|
@ -351,14 +391,14 @@
|
|||
" <td>5707</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>5670</td>\n",
|
||||
" <td>2097</td>\n",
|
||||
" <td>2096</td>\n",
|
||||
" <td>5670</td>\n",
|
||||
" <td>4622</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>2501</td>\n",
|
||||
" <td>5538</td>\n",
|
||||
" <td>820</td>\n",
|
||||
" <td>477</td>\n",
|
||||
" <td>819</td>\n",
|
||||
" <td>476</td>\n",
|
||||
" <td>5098</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2</td>\n",
|
||||
|
@ -393,7 +433,7 @@
|
|||
" <td>1</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>3569</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>95</td>\n",
|
||||
" <td>5067</td>\n",
|
||||
|
@ -575,10 +615,10 @@
|
|||
"max NaN 10175.000000 \n",
|
||||
"\n",
|
||||
" repository_name additional_name \\\n",
|
||||
"count 5707 5707 \n",
|
||||
"unique 5670 2097 \n",
|
||||
"count 5707 2138 \n",
|
||||
"unique 5670 2096 \n",
|
||||
"top hiroshima associated repository portal [] \n",
|
||||
"freq 3 3569 \n",
|
||||
"freq 3 4 \n",
|
||||
"mean NaN NaN \n",
|
||||
"std NaN NaN \n",
|
||||
"min NaN NaN \n",
|
||||
|
@ -614,8 +654,8 @@
|
|||
"max NaN NaN \n",
|
||||
"\n",
|
||||
" update_date start_date subject \\\n",
|
||||
"count 5707 5707 5707 \n",
|
||||
"unique 2501 5538 820 \n",
|
||||
"count 5707 5707 5542 \n",
|
||||
"unique 2501 5538 819 \n",
|
||||
"top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n",
|
||||
"freq 82 82 3212 \n",
|
||||
"mean NaN NaN NaN \n",
|
||||
|
@ -627,8 +667,8 @@
|
|||
"max NaN NaN NaN \n",
|
||||
"\n",
|
||||
" content_type \\\n",
|
||||
"count 5707 \n",
|
||||
"unique 477 \n",
|
||||
"count 5563 \n",
|
||||
"unique 476 \n",
|
||||
"top [theses_and_dissertations] \n",
|
||||
"freq 460 \n",
|
||||
"mean NaN \n",
|
||||
|
@ -666,7 +706,7 @@
|
|||
"max NaN NaN NaN NaN NaN "
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -677,34 +717,34 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"openaire_id 0\n",
|
||||
"opendoar_id 0\n",
|
||||
"repository_name 0\n",
|
||||
"additional_name 0\n",
|
||||
"repository_url 0\n",
|
||||
"description 282\n",
|
||||
"type 0\n",
|
||||
"update_date 0\n",
|
||||
"start_date 0\n",
|
||||
"subject 0\n",
|
||||
"content_type 0\n",
|
||||
"institution 0\n",
|
||||
"metadata_policy 0\n",
|
||||
"data_policy 0\n",
|
||||
"submission_policy 0\n",
|
||||
"content_policy 0\n",
|
||||
"software 0\n",
|
||||
"api 0\n",
|
||||
"openaire_id 0\n",
|
||||
"opendoar_id 0\n",
|
||||
"repository_name 0\n",
|
||||
"additional_name 3569\n",
|
||||
"repository_url 0\n",
|
||||
"description 282\n",
|
||||
"type 0\n",
|
||||
"update_date 0\n",
|
||||
"start_date 0\n",
|
||||
"subject 165\n",
|
||||
"content_type 144\n",
|
||||
"institution 0\n",
|
||||
"metadata_policy 0\n",
|
||||
"data_policy 0\n",
|
||||
"submission_policy 0\n",
|
||||
"content_policy 0\n",
|
||||
"software 0\n",
|
||||
"api 0\n",
|
||||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -713,6 +753,13 @@
|
|||
"opendoar_df.isna().sum()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
Loading…
Reference in New Issue