each registry has a basic analysis

This commit is contained in:
Andrea Mannocci 2021-07-23 15:28:23 +02:00
parent 434fe5ed20
commit dd6b79e69f
2 changed files with 165 additions and 116 deletions

View File

@ -51,7 +51,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -452,7 +452,7 @@
"4 2021-06-11 " "4 2021-06-11 "
] ]
}, },
"execution_count": 20, "execution_count": 2,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -473,7 +473,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -492,7 +492,7 @@
" dtype='object')" " dtype='object')"
] ]
}, },
"execution_count": 14, "execution_count": 3,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -503,7 +503,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def empty_list_is_nan(cell):\n",
" if isinstance(cell, list):\n",
" return np.nan if len(cell) == 0 else cell\n",
" else:\n",
" return cell\n",
" \n",
"re3data_df = re3data_df.applymap(empty_list_is_nan)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -574,21 +589,21 @@
" <td>2707</td>\n", " <td>2707</td>\n",
" <td>2707</td>\n", " <td>2707</td>\n",
" <td>2707</td>\n", " <td>2707</td>\n",
" <td>2707</td>\n", " <td>2137</td>\n",
" <td>2686</td>\n", " <td>2686</td>\n",
" <td>829</td>\n",
" <td>2707</td>\n", " <td>2707</td>\n",
" <td>2707</td>\n", " <td>2677</td>\n",
" <td>2707</td>\n",
" <td>1260</td>\n", " <td>1260</td>\n",
" <td>1248</td>\n", " <td>1248</td>\n",
" <td>1762</td>\n", " <td>1762</td>\n",
" <td>146</td>\n", " <td>146</td>\n",
" <td>2685</td>\n",
" <td>2707</td>\n", " <td>2707</td>\n",
" <td>2707</td>\n", " <td>2700</td>\n",
" <td>2707</td>\n", " <td>2699</td>\n",
" <td>2707</td>\n", " <td>2699</td>\n",
" <td>2707</td>\n", " <td>2706</td>\n",
" <td>2707</td>\n",
" <td>2707</td>\n", " <td>2707</td>\n",
" <td>2707</td>\n", " <td>2707</td>\n",
" <td>2707</td>\n", " <td>2707</td>\n",
@ -616,21 +631,21 @@
" <td>2707</td>\n", " <td>2707</td>\n",
" <td>2707</td>\n", " <td>2707</td>\n",
" <td>2704</td>\n", " <td>2704</td>\n",
" <td>2129</td>\n", " <td>2128</td>\n",
" <td>2683</td>\n", " <td>2683</td>\n",
" <td>829</td>\n", " <td>828</td>\n",
" <td>2705</td>\n", " <td>2705</td>\n",
" <td>9</td>\n", " <td>8</td>\n",
" <td>1233</td>\n", " <td>1233</td>\n",
" <td>687</td>\n", " <td>687</td>\n",
" <td>351</td>\n", " <td>351</td>\n",
" <td>79</td>\n", " <td>79</td>\n",
" <td>1368</td>\n", " <td>1367</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>1324</td>\n", " <td>1323</td>\n",
" <td>5</td>\n", " <td>4</td>\n",
" <td>2475</td>\n", " <td>2474</td>\n",
" <td>2686</td>\n", " <td>2685</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>1</td>\n", " <td>1</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
@ -655,12 +670,12 @@
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>top</th>\n", " <th>top</th>\n",
" <td>re3data_____::d8e2164dd005d3961c23e0762453cfb1</td>\n", " <td>re3data_____::4cea5a5ea78542232a51190879756661</td>\n",
" <td>r3d100010836</td>\n", " <td>r3d100011254</td>\n",
" <td>UCLA Social Science Data Archive Dataverse</td>\n", " <td>EarthChem Library</td>\n",
" <td>[]</td>\n", " <td>[IRIS]</td>\n",
" <td>http://www.jcvi.org/cms/home/</td>\n", " <td>http://www.jcvi.org/cms/home/</td>\n",
" <td>[]</td>\n", " <td>[doi:10.17171/1-6]</td>\n",
" <td>The repository is no longer available. &gt;&gt;&gt;!!!&lt;...</td>\n", " <td>The repository is no longer available. &gt;&gt;&gt;!!!&lt;...</td>\n",
" <td>[disciplinary]</td>\n", " <td>[disciplinary]</td>\n",
" <td>2 datasets</td>\n", " <td>2 datasets</td>\n",
@ -700,9 +715,9 @@
" <td>1</td>\n", " <td>1</td>\n",
" <td>1</td>\n", " <td>1</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>570</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>1878</td>\n", " <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>1713</td>\n", " <td>1713</td>\n",
" <td>6</td>\n", " <td>6</td>\n",
@ -745,48 +760,42 @@
" openaire_id re3data_id \\\n", " openaire_id re3data_id \\\n",
"count 2707 2707 \n", "count 2707 2707 \n",
"unique 2707 2707 \n", "unique 2707 2707 \n",
"top re3data_____::d8e2164dd005d3961c23e0762453cfb1 r3d100010836 \n", "top re3data_____::4cea5a5ea78542232a51190879756661 r3d100011254 \n",
"freq 1 1 \n", "freq 1 1 \n",
"\n", "\n",
" repository_name additional_name \\\n", " repository_name additional_name repository_url \\\n",
"count 2707 2707 \n", "count 2707 2137 2686 \n",
"unique 2704 2129 \n", "unique 2704 2128 2683 \n",
"top UCLA Social Science Data Archive Dataverse [] \n", "top EarthChem Library [IRIS] http://www.jcvi.org/cms/home/ \n",
"freq 2 570 \n", "freq 2 2 2 \n",
"\n", "\n",
" repository_url repository_id \\\n", " repository_id description \\\n",
"count 2686 2707 \n", "count 829 2707 \n",
"unique 2683 829 \n", "unique 828 2705 \n",
"top http://www.jcvi.org/cms/home/ [] \n", "top [doi:10.17171/1-6] The repository is no longer available. >>>!!!<... \n",
"freq 2 1878 \n", "freq 2 2 \n",
"\n", "\n",
" description type \\\n", " type size update_date start_date end_date \\\n",
"count 2707 2707 \n", "count 2677 1260 1248 1762 146 \n",
"unique 2705 9 \n", "unique 8 1233 687 351 79 \n",
"top The repository is no longer available. >>>!!!<... [disciplinary] \n", "top [disciplinary] 2 datasets 2019-05-15 2008 2015 \n",
"freq 2 1713 \n", "freq 1713 6 15 92 11 \n",
"\n",
" size update_date start_date end_date \\\n",
"count 1260 1248 1762 146 \n",
"unique 1233 687 351 79 \n",
"top 2 datasets 2019-05-15 2008 2015 \n",
"freq 6 15 92 11 \n",
"\n", "\n",
" subject mission_statement \\\n", " subject mission_statement \\\n",
"count 2707 2707 \n", "count 2685 2707 \n",
"unique 1368 2 \n", "unique 1367 2 \n",
"top [1 Humanities and Social Sciences, 2 Life Scie... true \n", "top [1 Humanities and Social Sciences, 2 Life Scie... true \n",
"freq 222 2286 \n", "freq 222 2286 \n",
"\n", "\n",
" content_type provider_type keyword \\\n", " content_type provider_type keyword \\\n",
"count 2707 2707 2707 \n", "count 2700 2699 2699 \n",
"unique 1324 5 2475 \n", "unique 1323 4 2474 \n",
"top [Standard office documents] [dataProvider] [multidisciplinary] \n", "top [Standard office documents] [dataProvider] [multidisciplinary] \n",
"freq 30 1748 190 \n", "freq 30 1748 190 \n",
"\n", "\n",
" institution policy \\\n", " institution policy \\\n",
"count 2707 2707 \n", "count 2706 2707 \n",
"unique 2686 2 \n", "unique 2685 2 \n",
"top [[National Center for Biotechnology Informatio... true \n", "top [[National Center for Biotechnology Informatio... true \n",
"freq 6 2394 \n", "freq 6 2394 \n",
"\n", "\n",
@ -827,7 +836,7 @@
"freq 47 " "freq 47 "
] ]
}, },
"execution_count": 3, "execution_count": 5,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -838,7 +847,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -847,21 +856,21 @@
"openaire_id 0\n", "openaire_id 0\n",
"re3data_id 0\n", "re3data_id 0\n",
"repository_name 0\n", "repository_name 0\n",
"additional_name 0\n", "additional_name 570\n",
"repository_url 21\n", "repository_url 21\n",
"repository_id 0\n", "repository_id 1878\n",
"description 0\n", "description 0\n",
"type 0\n", "type 30\n",
"size 1447\n", "size 1447\n",
"update_date 1459\n", "update_date 1459\n",
"start_date 945\n", "start_date 945\n",
"end_date 2561\n", "end_date 2561\n",
"subject 0\n", "subject 22\n",
"mission_statement 0\n", "mission_statement 0\n",
"content_type 0\n", "content_type 7\n",
"provider_type 0\n", "provider_type 8\n",
"keyword 0\n", "keyword 8\n",
"institution 0\n", "institution 1\n",
"policy 0\n", "policy 0\n",
"database_access 0\n", "database_access 0\n",
"database_license 0\n", "database_license 0\n",
@ -886,7 +895,7 @@
"dtype: int64" "dtype: int64"
] ]
}, },
"execution_count": 10, "execution_count": 6,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -897,7 +906,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -911,7 +920,7 @@
" 'Configuration data', 'Networkbased data', nan], dtype=object)" " 'Configuration data', 'Networkbased data', nan], dtype=object)"
] ]
}, },
"execution_count": 18, "execution_count": 7,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -922,7 +931,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -931,7 +940,7 @@
"array(['dataProvider', 'serviceProvider', nan], dtype=object)" "array(['dataProvider', 'serviceProvider', nan], dtype=object)"
] ]
}, },
"execution_count": 19, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -939,13 +948,6 @@
"source": [ "source": [
"re3data_df.provider_type.explode().unique()" "re3data_df.provider_type.explode().unique()"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {

View File

@ -36,7 +36,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 24,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -262,7 +262,7 @@
"4 true " "4 true "
] ]
}, },
"execution_count": 3, "execution_count": 24,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -280,7 +280,47 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',\n",
" 'repository_url', 'description', 'type', 'update_date', 'start_date',\n",
" 'subject', 'content_type', 'institution', 'metadata_policy',\n",
" 'data_policy', 'submission_policy', 'content_policy', 'software',\n",
" 'api'],\n",
" dtype='object')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"opendoar_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def empty_list_is_nan(cell):\n",
" if isinstance(cell, list):\n",
" return np.nan if len(cell) == 0 else cell\n",
" else:\n",
" return cell\n",
" \n",
"opendoar_df = opendoar_df.applymap(empty_list_is_nan)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -330,14 +370,14 @@
" <td>5707</td>\n", " <td>5707</td>\n",
" <td>5707.000000</td>\n", " <td>5707.000000</td>\n",
" <td>5707</td>\n", " <td>5707</td>\n",
" <td>5707</td>\n", " <td>2138</td>\n",
" <td>5707</td>\n", " <td>5707</td>\n",
" <td>5425</td>\n", " <td>5425</td>\n",
" <td>5707</td>\n", " <td>5707</td>\n",
" <td>5707</td>\n", " <td>5707</td>\n",
" <td>5707</td>\n", " <td>5707</td>\n",
" <td>5707</td>\n", " <td>5542</td>\n",
" <td>5707</td>\n", " <td>5563</td>\n",
" <td>5707</td>\n", " <td>5707</td>\n",
" <td>5707</td>\n", " <td>5707</td>\n",
" <td>5707</td>\n", " <td>5707</td>\n",
@ -351,14 +391,14 @@
" <td>5707</td>\n", " <td>5707</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>5670</td>\n", " <td>5670</td>\n",
" <td>2097</td>\n", " <td>2096</td>\n",
" <td>5670</td>\n", " <td>5670</td>\n",
" <td>4622</td>\n", " <td>4622</td>\n",
" <td>4</td>\n", " <td>4</td>\n",
" <td>2501</td>\n", " <td>2501</td>\n",
" <td>5538</td>\n", " <td>5538</td>\n",
" <td>820</td>\n", " <td>819</td>\n",
" <td>477</td>\n", " <td>476</td>\n",
" <td>5098</td>\n", " <td>5098</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
" <td>2</td>\n", " <td>2</td>\n",
@ -393,7 +433,7 @@
" <td>1</td>\n", " <td>1</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>3</td>\n", " <td>3</td>\n",
" <td>3569</td>\n", " <td>4</td>\n",
" <td>3</td>\n", " <td>3</td>\n",
" <td>95</td>\n", " <td>95</td>\n",
" <td>5067</td>\n", " <td>5067</td>\n",
@ -575,10 +615,10 @@
"max NaN 10175.000000 \n", "max NaN 10175.000000 \n",
"\n", "\n",
" repository_name additional_name \\\n", " repository_name additional_name \\\n",
"count 5707 5707 \n", "count 5707 2138 \n",
"unique 5670 2097 \n", "unique 5670 2096 \n",
"top hiroshima associated repository portal [] \n", "top hiroshima associated repository portal [] \n",
"freq 3 3569 \n", "freq 3 4 \n",
"mean NaN NaN \n", "mean NaN NaN \n",
"std NaN NaN \n", "std NaN NaN \n",
"min NaN NaN \n", "min NaN NaN \n",
@ -614,8 +654,8 @@
"max NaN NaN \n", "max NaN NaN \n",
"\n", "\n",
" update_date start_date subject \\\n", " update_date start_date subject \\\n",
"count 5707 5707 5707 \n", "count 5707 5707 5542 \n",
"unique 2501 5538 820 \n", "unique 2501 5538 819 \n",
"top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n", "top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n",
"freq 82 82 3212 \n", "freq 82 82 3212 \n",
"mean NaN NaN NaN \n", "mean NaN NaN NaN \n",
@ -627,8 +667,8 @@
"max NaN NaN NaN \n", "max NaN NaN NaN \n",
"\n", "\n",
" content_type \\\n", " content_type \\\n",
"count 5707 \n", "count 5563 \n",
"unique 477 \n", "unique 476 \n",
"top [theses_and_dissertations] \n", "top [theses_and_dissertations] \n",
"freq 460 \n", "freq 460 \n",
"mean NaN \n", "mean NaN \n",
@ -666,7 +706,7 @@
"max NaN NaN NaN NaN NaN " "max NaN NaN NaN NaN NaN "
] ]
}, },
"execution_count": 4, "execution_count": 29,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -677,34 +717,34 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 30,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"openaire_id 0\n", "openaire_id 0\n",
"opendoar_id 0\n", "opendoar_id 0\n",
"repository_name 0\n", "repository_name 0\n",
"additional_name 0\n", "additional_name 3569\n",
"repository_url 0\n", "repository_url 0\n",
"description 282\n", "description 282\n",
"type 0\n", "type 0\n",
"update_date 0\n", "update_date 0\n",
"start_date 0\n", "start_date 0\n",
"subject 0\n", "subject 165\n",
"content_type 0\n", "content_type 144\n",
"institution 0\n", "institution 0\n",
"metadata_policy 0\n", "metadata_policy 0\n",
"data_policy 0\n", "data_policy 0\n",
"submission_policy 0\n", "submission_policy 0\n",
"content_policy 0\n", "content_policy 0\n",
"software 0\n", "software 0\n",
"api 0\n", "api 0\n",
"dtype: int64" "dtype: int64"
] ]
}, },
"execution_count": 5, "execution_count": 30,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -713,6 +753,13 @@
"opendoar_df.isna().sum()" "opendoar_df.isna().sum()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,