diff --git a/notebooks/01.1-exploration-re3data.ipynb b/notebooks/01.1-exploration-re3data.ipynb
index 5bfe9b5..5057a70 100644
--- a/notebooks/01.1-exploration-re3data.ipynb
+++ b/notebooks/01.1-exploration-re3data.ipynb
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -452,7 +452,7 @@
"4 2021-06-11 "
]
},
- "execution_count": 20,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -473,7 +473,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -492,7 +492,7 @@
" dtype='object')"
]
},
- "execution_count": 14,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -503,7 +503,22 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def empty_list_is_nan(cell):\n",
+ "    \"\"\"Map an empty list to NaN so pandas treats it as missing; pass other cells through.\"\"\"\n",
+ "    if isinstance(cell, list) and not cell:\n",
+ "        return np.nan\n",
+ "    return cell\n",
+ "\n",
+ "re3data_df = re3data_df.applymap(empty_list_is_nan)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -574,21 +589,21 @@
"
2707 | \n",
" 2707 | \n",
" 2707 | \n",
- " 2707 | \n",
+ " 2137 | \n",
" 2686 | \n",
+ " 829 | \n",
" 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
+ " 2677 | \n",
" 1260 | \n",
" 1248 | \n",
" 1762 | \n",
" 146 | \n",
+ " 2685 | \n",
" 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
- " 2707 | \n",
+ " 2700 | \n",
+ " 2699 | \n",
+ " 2699 | \n",
+ " 2706 | \n",
" 2707 | \n",
" 2707 | \n",
" 2707 | \n",
@@ -616,21 +631,21 @@
" 2707 | \n",
" 2707 | \n",
" 2704 | \n",
- " 2129 | \n",
+ " 2128 | \n",
" 2683 | \n",
- " 829 | \n",
+ " 828 | \n",
" 2705 | \n",
- " 9 | \n",
+ " 8 | \n",
" 1233 | \n",
" 687 | \n",
" 351 | \n",
" 79 | \n",
- " 1368 | \n",
+ " 1367 | \n",
" 2 | \n",
- " 1324 | \n",
- " 5 | \n",
- " 2475 | \n",
- " 2686 | \n",
+ " 1323 | \n",
+ " 4 | \n",
+ " 2474 | \n",
+ " 2685 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
@@ -655,12 +670,12 @@
" \n",
" \n",
" top | \n",
- " re3data_____::d8e2164dd005d3961c23e0762453cfb1 | \n",
- " r3d100010836 | \n",
- " UCLA Social Science Data Archive Dataverse | \n",
- " [] | \n",
+ " re3data_____::4cea5a5ea78542232a51190879756661 | \n",
+ " r3d100011254 | \n",
+ " EarthChem Library | \n",
+ " [IRIS] | \n",
" http://www.jcvi.org/cms/home/ | \n",
- " [] | \n",
+ " [doi:10.17171/1-6] | \n",
" The repository is no longer available. >>>!!!<... | \n",
" [disciplinary] | \n",
" 2 datasets | \n",
@@ -700,9 +715,9 @@
" 1 | \n",
" 1 | \n",
" 2 | \n",
- " 570 | \n",
" 2 | \n",
- " 1878 | \n",
+ " 2 | \n",
+ " 2 | \n",
" 2 | \n",
" 1713 | \n",
" 6 | \n",
@@ -745,48 +760,42 @@
" openaire_id re3data_id \\\n",
"count 2707 2707 \n",
"unique 2707 2707 \n",
- "top re3data_____::d8e2164dd005d3961c23e0762453cfb1 r3d100010836 \n",
+ "top re3data_____::4cea5a5ea78542232a51190879756661 r3d100011254 \n",
"freq 1 1 \n",
"\n",
- " repository_name additional_name \\\n",
- "count 2707 2707 \n",
- "unique 2704 2129 \n",
- "top UCLA Social Science Data Archive Dataverse [] \n",
- "freq 2 570 \n",
+ " repository_name additional_name repository_url \\\n",
+ "count 2707 2137 2686 \n",
+ "unique 2704 2128 2683 \n",
+ "top EarthChem Library [IRIS] http://www.jcvi.org/cms/home/ \n",
+ "freq 2 2 2 \n",
"\n",
- " repository_url repository_id \\\n",
- "count 2686 2707 \n",
- "unique 2683 829 \n",
- "top http://www.jcvi.org/cms/home/ [] \n",
- "freq 2 1878 \n",
+ " repository_id description \\\n",
+ "count 829 2707 \n",
+ "unique 828 2705 \n",
+ "top [doi:10.17171/1-6] The repository is no longer available. >>>!!!<... \n",
+ "freq 2 2 \n",
"\n",
- " description type \\\n",
- "count 2707 2707 \n",
- "unique 2705 9 \n",
- "top The repository is no longer available. >>>!!!<... [disciplinary] \n",
- "freq 2 1713 \n",
- "\n",
- " size update_date start_date end_date \\\n",
- "count 1260 1248 1762 146 \n",
- "unique 1233 687 351 79 \n",
- "top 2 datasets 2019-05-15 2008 2015 \n",
- "freq 6 15 92 11 \n",
+ " type size update_date start_date end_date \\\n",
+ "count 2677 1260 1248 1762 146 \n",
+ "unique 8 1233 687 351 79 \n",
+ "top [disciplinary] 2 datasets 2019-05-15 2008 2015 \n",
+ "freq 1713 6 15 92 11 \n",
"\n",
" subject mission_statement \\\n",
- "count 2707 2707 \n",
- "unique 1368 2 \n",
+ "count 2685 2707 \n",
+ "unique 1367 2 \n",
"top [1 Humanities and Social Sciences, 2 Life Scie... true \n",
"freq 222 2286 \n",
"\n",
" content_type provider_type keyword \\\n",
- "count 2707 2707 2707 \n",
- "unique 1324 5 2475 \n",
+ "count 2700 2699 2699 \n",
+ "unique 1323 4 2474 \n",
"top [Standard office documents] [dataProvider] [multidisciplinary] \n",
"freq 30 1748 190 \n",
"\n",
" institution policy \\\n",
- "count 2707 2707 \n",
- "unique 2686 2 \n",
+ "count 2706 2707 \n",
+ "unique 2685 2 \n",
"top [[National Center for Biotechnology Informatio... true \n",
"freq 6 2394 \n",
"\n",
@@ -827,7 +836,7 @@
"freq 47 "
]
},
- "execution_count": 3,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -838,7 +847,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -847,21 +856,21 @@
"openaire_id 0\n",
"re3data_id 0\n",
"repository_name 0\n",
- "additional_name 0\n",
+ "additional_name 570\n",
"repository_url 21\n",
- "repository_id 0\n",
+ "repository_id 1878\n",
"description 0\n",
- "type 0\n",
+ "type 30\n",
"size 1447\n",
"update_date 1459\n",
"start_date 945\n",
"end_date 2561\n",
- "subject 0\n",
+ "subject 22\n",
"mission_statement 0\n",
- "content_type 0\n",
- "provider_type 0\n",
- "keyword 0\n",
- "institution 0\n",
+ "content_type 7\n",
+ "provider_type 8\n",
+ "keyword 8\n",
+ "institution 1\n",
"policy 0\n",
"database_access 0\n",
"database_license 0\n",
@@ -886,7 +895,7 @@
"dtype: int64"
]
},
- "execution_count": 10,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -897,7 +906,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -911,7 +920,7 @@
" 'Configuration data', 'Networkbased data', nan], dtype=object)"
]
},
- "execution_count": 18,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -922,7 +931,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -931,7 +940,7 @@
"array(['dataProvider', 'serviceProvider', nan], dtype=object)"
]
},
- "execution_count": 19,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -939,13 +948,6 @@
"source": [
"re3data_df.provider_type.explode().unique()"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/notebooks/01.2-exploration-opendoar.ipynb b/notebooks/01.2-exploration-opendoar.ipynb
index dcdad8e..49cc030 100644
--- a/notebooks/01.2-exploration-opendoar.ipynb
+++ b/notebooks/01.2-exploration-opendoar.ipynb
@@ -36,7 +36,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
@@ -262,7 +262,7 @@
"4 true "
]
},
- "execution_count": 3,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -280,7 +280,47 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',\n",
+ " 'repository_url', 'description', 'type', 'update_date', 'start_date',\n",
+ " 'subject', 'content_type', 'institution', 'metadata_policy',\n",
+ " 'data_policy', 'submission_policy', 'content_policy', 'software',\n",
+ " 'api'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "opendoar_df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def empty_list_is_nan(cell):\n",
+ "    \"\"\"Map an empty list to NaN so pandas treats it as missing; pass other cells through.\"\"\"\n",
+ "    if isinstance(cell, list) and not cell:\n",
+ "        return np.nan\n",
+ "    return cell\n",
+ "\n",
+ "opendoar_df = opendoar_df.applymap(empty_list_is_nan)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
"metadata": {},
"outputs": [
{
@@ -330,14 +370,14 @@
" 5707 | \n",
" 5707.000000 | \n",
" 5707 | \n",
- " 5707 | \n",
+ " 2138 | \n",
" 5707 | \n",
" 5425 | \n",
" 5707 | \n",
" 5707 | \n",
" 5707 | \n",
- " 5707 | \n",
- " 5707 | \n",
+ " 5542 | \n",
+ " 5563 | \n",
" 5707 | \n",
" 5707 | \n",
" 5707 | \n",
@@ -351,14 +391,14 @@
" 5707 | \n",
" NaN | \n",
" 5670 | \n",
- " 2097 | \n",
+ " 2096 | \n",
" 5670 | \n",
" 4622 | \n",
" 4 | \n",
" 2501 | \n",
" 5538 | \n",
- " 820 | \n",
- " 477 | \n",
+ " 819 | \n",
+ " 476 | \n",
" 5098 | \n",
" 2 | \n",
" 2 | \n",
@@ -393,7 +433,7 @@
" 1 | \n",
" NaN | \n",
" 3 | \n",
- " 3569 | \n",
+ " 4 | \n",
" 3 | \n",
" 95 | \n",
" 5067 | \n",
@@ -575,10 +615,10 @@
"max NaN 10175.000000 \n",
"\n",
" repository_name additional_name \\\n",
- "count 5707 5707 \n",
- "unique 5670 2097 \n",
+ "count 5707 2138 \n",
+ "unique 5670 2096 \n",
"top hiroshima associated repository portal [] \n",
- "freq 3 3569 \n",
+ "freq 3 4 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
@@ -614,8 +654,8 @@
"max NaN NaN \n",
"\n",
" update_date start_date subject \\\n",
- "count 5707 5707 5707 \n",
- "unique 2501 5538 820 \n",
+ "count 5707 5707 5542 \n",
+ "unique 2501 5538 819 \n",
"top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n",
"freq 82 82 3212 \n",
"mean NaN NaN NaN \n",
@@ -627,8 +667,8 @@
"max NaN NaN NaN \n",
"\n",
" content_type \\\n",
- "count 5707 \n",
- "unique 477 \n",
+ "count 5563 \n",
+ "unique 476 \n",
"top [theses_and_dissertations] \n",
"freq 460 \n",
"mean NaN \n",
@@ -666,7 +706,7 @@
"max NaN NaN NaN NaN NaN "
]
},
- "execution_count": 4,
+ "execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@@ -677,34 +717,34 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "openaire_id 0\n",
- "opendoar_id 0\n",
- "repository_name 0\n",
- "additional_name 0\n",
- "repository_url 0\n",
- "description 282\n",
- "type 0\n",
- "update_date 0\n",
- "start_date 0\n",
- "subject 0\n",
- "content_type 0\n",
- "institution 0\n",
- "metadata_policy 0\n",
- "data_policy 0\n",
- "submission_policy 0\n",
- "content_policy 0\n",
- "software 0\n",
- "api 0\n",
+ "openaire_id 0\n",
+ "opendoar_id 0\n",
+ "repository_name 0\n",
+ "additional_name 3569\n",
+ "repository_url 0\n",
+ "description 282\n",
+ "type 0\n",
+ "update_date 0\n",
+ "start_date 0\n",
+ "subject 165\n",
+ "content_type 144\n",
+ "institution 0\n",
+ "metadata_policy 0\n",
+ "data_policy 0\n",
+ "submission_policy 0\n",
+ "content_policy 0\n",
+ "software 0\n",
+ "api 0\n",
"dtype: int64"
]
},
- "execution_count": 5,
+ "execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
@@ -713,6 +753,13 @@
"opendoar_df.isna().sum()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
{
"cell_type": "code",
"execution_count": null,