added simple checks for across registrations

This commit is contained in:
Andrea Mannocci 2021-10-06 15:22:23 +02:00
parent 264f527fcb
commit 74bb9edd04
1 changed files with 234 additions and 138 deletions

View File

@ -1448,8 +1448,8 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n", " <td>[opendoar, celestial]</td>\n",
" <td>[58, 669]</td>\n", " <td>[669, 58]</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -1527,7 +1527,7 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n", " <td>[opendoar, celestial]</td>\n",
" <td>[526, 258]</td>\n", " <td>[526, 258]</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -1663,8 +1663,8 @@
" <td>TRUE</td>\n", " <td>TRUE</td>\n",
" <td>TRUE</td>\n", " <td>TRUE</td>\n",
" <td>TRUE</td>\n", " <td>TRUE</td>\n",
" <td>[Climate Service Center 2.0, KLIMZUG projects,...</td>\n", " <td>[Helmholtz-Zentrum Geesthacht, Climate Service...</td>\n",
" <td>[http://www.klimzug.de/de/94.php, http://www.c...</td>\n", " <td>[http://www.climateservicecenter.de/, http://w...</td>\n",
" <td>de</td>\n", " <td>de</td>\n",
" <td>Hamburg</td>\n", " <td>Hamburg</td>\n",
" <td>53.5511</td>\n", " <td>53.5511</td>\n",
@ -1672,7 +1672,7 @@
" <td>opus</td>\n", " <td>opus</td>\n",
" <td>geoname_2_DE</td>\n", " <td>geoname_2_DE</td>\n",
" <td>other</td>\n", " <td>other</td>\n",
" <td>[GE, GF, S1, HD, G1]</td>\n", " <td>[GE, S1, G1, HD, GF]</td>\n",
" <td>2015-07-02 08:08:31</td>\n", " <td>2015-07-02 08:08:31</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -1685,7 +1685,7 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n", " <td>[opendoar, celestial]</td>\n",
" <td>[5881, 3408]</td>\n", " <td>[5881, 3408]</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -1893,14 +1893,14 @@
"0 NaN \n", "0 NaN \n",
"1 NaN \n", "1 NaN \n",
"2 NaN \n", "2 NaN \n",
"3 [Climate Service Center 2.0, KLIMZUG projects,... \n", "3 [Helmholtz-Zentrum Geesthacht, Climate Service... \n",
"4 Skidmore College \n", "4 Skidmore College \n",
"\n", "\n",
" roar_organisation_home_page roar_location_country \\\n", " roar_organisation_home_page roar_location_country \\\n",
"0 NaN fr \n", "0 NaN fr \n",
"1 NaN se \n", "1 NaN se \n",
"2 NaN pt \n", "2 NaN pt \n",
"3 [http://www.klimzug.de/de/94.php, http://www.c... de \n", "3 [http://www.climateservicecenter.de/, http://w... de \n",
"4 http://www.skidmore.edu/ us \n", "4 http://www.skidmore.edu/ us \n",
"\n", "\n",
" roar_location_city roar_location_latitude roar_location_longitude \\\n", " roar_location_city roar_location_latitude roar_location_longitude \\\n",
@ -1914,7 +1914,7 @@
"0 hal geoname_2_FR other NaN \n", "0 hal geoname_2_FR other NaN \n",
"1 diva geoname_2_SE other NaN \n", "1 diva geoname_2_SE other NaN \n",
"2 dspace geoname_2_PT other NaN \n", "2 dspace geoname_2_PT other NaN \n",
"3 opus geoname_2_DE other [GE, GF, S1, HD, G1] \n", "3 opus geoname_2_DE other [GE, S1, G1, HD, GF] \n",
"4 bepress geoname_2_US other NaN \n", "4 bepress geoname_2_US other NaN \n",
"\n", "\n",
" roar_date roar_note roar_suggestions roar_activity_low \\\n", " roar_date roar_note roar_suggestions roar_activity_low \\\n",
@ -1946,10 +1946,10 @@
"4 NaN NaN NaN \n", "4 NaN NaN NaN \n",
"\n", "\n",
" roar_registry_name roar_registry_id roar_submit_to \\\n", " roar_registry_name roar_registry_id roar_submit_to \\\n",
"0 [celestial, opendoar] [58, 669] NaN \n", "0 [opendoar, celestial] [669, 58] NaN \n",
"1 [celestial, opendoar] [526, 258] NaN \n", "1 [opendoar, celestial] [526, 258] NaN \n",
"2 NaN NaN NaN \n", "2 NaN NaN NaN \n",
"3 [celestial, opendoar] [5881, 3408] NaN \n", "3 [opendoar, celestial] [5881, 3408] NaN \n",
"4 celestial 5882 NaN \n", "4 celestial 5882 NaN \n",
"\n", "\n",
" roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n", " roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n",
@ -2163,7 +2163,7 @@
" <td>dspace</td>\n", " <td>dspace</td>\n",
" <td>geoname_2_UA</td>\n", " <td>geoname_2_UA</td>\n",
" <td>other</td>\n", " <td>other</td>\n",
" <td>[H1, D1, D204, BL, DK, D901, AC, BF, L1, BS, H...</td>\n", " <td>[B1, BS, BL, AC, D204, DK, HM, BF, L1, D1, H1,...</td>\n",
" <td>2015-07-07 12:38:37</td>\n", " <td>2015-07-07 12:38:37</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -2176,8 +2176,8 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n", " <td>[opendoar, celestial]</td>\n",
" <td>[3410, 5883]</td>\n", " <td>[5883, 3410]</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -2245,7 +2245,7 @@
"7 NaN NaN dspace geoname_2_UA \n", "7 NaN NaN dspace geoname_2_UA \n",
"\n", "\n",
" roar_version roar_subjects \\\n", " roar_version roar_subjects \\\n",
"7 other [H1, D1, D204, BL, DK, D901, AC, BF, L1, BS, H... \n", "7 other [B1, BS, BL, AC, D204, DK, HM, BF, L1, D1, H1,... \n",
"\n", "\n",
" roar_date roar_note roar_suggestions roar_activity_low \\\n", " roar_date roar_note roar_suggestions roar_activity_low \\\n",
"7 2015-07-07 12:38:37 NaN NaN NaN \n", "7 2015-07-07 12:38:37 NaN NaN NaN \n",
@ -2257,7 +2257,7 @@
"7 NaN NaN NaN \n", "7 NaN NaN NaN \n",
"\n", "\n",
" roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to \\\n", " roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to \\\n",
"7 NaN [celestial, opendoar] [3410, 5883] NaN \n", "7 NaN [opendoar, celestial] [5883, 3410] NaN \n",
"\n", "\n",
" roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n", " roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n",
"7 NaN NaN NaN \n", "7 NaN NaN NaN \n",
@ -2417,15 +2417,6 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [
"dup.groupby('dedup_id').aggregate(list).reset_index()[['dedup_id', 'name']].to_csv('../data/interim/to_validate.csv')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
@ -2517,7 +2508,7 @@
"freq 4 1977 1 " "freq 4 1977 1 "
] ]
}, },
"execution_count": 9, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2526,6 +2517,120 @@
"dup.describe()" "dup.describe()"
] ]
}, },
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"dup_grouped = dup.groupby('dedup_id').aggregate(list)\n",
"dup_grouped['source_set'] = dup_grouped.source.map(set)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"duplicate_id 6\n",
"original_id 6\n",
"name 6\n",
"source 6\n",
"unique_id 6\n",
"source_set 6\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup_grouped[dup_grouped.source_set.str.len() == 4].count()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"duplicate_id 60\n",
"original_id 60\n",
"name 60\n",
"source 60\n",
"unique_id 60\n",
"source_set 60\n",
"dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup_grouped[dup_grouped.source_set.str.len() == 3].count()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"duplicate_id 1986\n",
"original_id 1986\n",
"name 1986\n",
"source 1986\n",
"unique_id 1986\n",
"source_set 1986\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup_grouped[dup_grouped.source_set.str.len() == 2].count()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"duplicate_id 139\n",
"original_id 139\n",
"name 139\n",
"source 139\n",
"unique_id 139\n",
"source_set 139\n",
"dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup_grouped[dup_grouped.source_set.str.len() == 1].count()"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -2535,7 +2640,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2676,7 +2781,7 @@
"dedup::03e0704b5690a2dee1861dc3ad3316c9 {roar} " "dedup::03e0704b5690a2dee1861dc3ad3316c9 {roar} "
] ]
}, },
"execution_count": 10, "execution_count": 14,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2690,7 +2795,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2831,7 +2936,7 @@
"dedup::03e0704b5690a2dee1861dc3ad3316c9 roar " "dedup::03e0704b5690a2dee1861dc3ad3316c9 roar "
] ]
}, },
"execution_count": 11, "execution_count": 15,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2843,7 +2948,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2919,7 +3024,7 @@
"roar 121 121 121 121 121" "roar 121 121 121 121 121"
] ]
}, },
"execution_count": 12, "execution_count": 16,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2930,7 +3035,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 17,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -3097,7 +3202,7 @@
"[287 rows x 6 columns]" "[287 rows x 6 columns]"
] ]
}, },
"execution_count": 13, "execution_count": 17,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -3116,7 +3221,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 18,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -3309,7 +3414,7 @@
"[440 rows x 6 columns]" "[440 rows x 6 columns]"
] ]
}, },
"execution_count": 14, "execution_count": 18,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -3333,7 +3438,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 19,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -3526,7 +3631,7 @@
"[3890 rows x 6 columns]" "[3890 rows x 6 columns]"
] ]
}, },
"execution_count": 15, "execution_count": 19,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -3547,7 +3652,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 20,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -3562,7 +3667,7 @@
"dtype: int64" "dtype: int64"
] ]
}, },
"execution_count": 16, "execution_count": 20,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -3573,7 +3678,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 21,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -3588,7 +3693,7 @@
"dtype: int64" "dtype: int64"
] ]
}, },
"execution_count": 17, "execution_count": 21,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -3599,7 +3704,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 22,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -3608,7 +3713,7 @@
"2191" "2191"
] ]
}, },
"execution_count": 18, "execution_count": 22,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -3619,7 +3724,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 23,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -3628,7 +3733,7 @@
"2191" "2191"
] ]
}, },
"execution_count": 19, "execution_count": 23,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -3637,15 +3742,6 @@
"dup.groupby('dedup_id').ngroups" "dup.groupby('dedup_id').ngroups"
] ]
}, },
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"dup_within.to_csv('../data/processed/dup_within.csv')"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -3655,7 +3751,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 24,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -4188,8 +4284,8 @@
" <td>TRUE</td>\n", " <td>TRUE</td>\n",
" <td>TRUE</td>\n", " <td>TRUE</td>\n",
" <td>FALSE</td>\n", " <td>FALSE</td>\n",
" <td>[University of Sumatera Utara, USU Library]</td>\n", " <td>[USU Library, University of Sumatera Utara]</td>\n",
" <td>[http://www.usu.ac.id, http://library.usu.ac.id]</td>\n", " <td>[http://library.usu.ac.id, http://www.usu.ac.id]</td>\n",
" <td>id</td>\n", " <td>id</td>\n",
" <td>Medan</td>\n", " <td>Medan</td>\n",
" <td>3.5595</td>\n", " <td>3.5595</td>\n",
@ -4210,8 +4306,8 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>[roarmap, celestial, opendoar]</td>\n", " <td>[opendoar, roarmap, celestial]</td>\n",
" <td>[2101, 1717, 283]</td>\n", " <td>[1717, 2101, 283]</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -4379,7 +4475,7 @@
" <td>dspace</td>\n", " <td>dspace</td>\n",
" <td>geoname_2_ZA</td>\n", " <td>geoname_2_ZA</td>\n",
" <td>other</td>\n", " <td>other</td>\n",
" <td>[AS, B1, AI]</td>\n", " <td>[B1, AI, AS]</td>\n",
" <td>2015-02-10 06:35:50</td>\n", " <td>2015-02-10 06:35:50</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -4574,7 +4670,7 @@
" <td>0</td>\n", " <td>0</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>0</td>\n", " <td>0</td>\n",
" <td>[celestial, opendoar]</td>\n", " <td>[opendoar, celestial]</td>\n",
" <td>[1779, 1627]</td>\n", " <td>[1779, 1627]</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -5205,14 +5301,14 @@
"\n", "\n",
" roar_open_access roar_mandate roar_organisation_title \\\n", " roar_open_access roar_mandate roar_organisation_title \\\n",
"0 FALSE TRUE Università degli Studi di Milano \n", "0 FALSE TRUE Università degli Studi di Milano \n",
"1 TRUE FALSE [University of Sumatera Utara, USU Library] \n", "1 TRUE FALSE [USU Library, University of Sumatera Utara] \n",
"2 TRUE FALSE National Research Foundation of South Africa \n", "2 TRUE FALSE National Research Foundation of South Africa \n",
"3 NaN NaN Swansea Metropolitan University \n", "3 NaN NaN Swansea Metropolitan University \n",
"4 FALSE FALSE NaN \n", "4 FALSE FALSE NaN \n",
"\n", "\n",
" roar_organisation_home_page roar_location_country \\\n", " roar_organisation_home_page roar_location_country \\\n",
"0 http://www.unimi.it it \n", "0 http://www.unimi.it it \n",
"1 [http://www.usu.ac.id, http://library.usu.ac.id] id \n", "1 [http://library.usu.ac.id, http://www.usu.ac.id] id \n",
"2 http://www.nrf.ac.za/ za \n", "2 http://www.nrf.ac.za/ za \n",
"3 http://www.smu.ac.uk/ gb \n", "3 http://www.smu.ac.uk/ gb \n",
"4 NaN pe \n", "4 NaN pe \n",
@ -5227,7 +5323,7 @@
" roar_software roar_geoname roar_version roar_subjects roar_date \\\n", " roar_software roar_geoname roar_version roar_subjects roar_date \\\n",
"0 dspace geoname_2_IT other NaN 2014-05-04 17:40:53 \n", "0 dspace geoname_2_IT other NaN 2014-05-04 17:40:53 \n",
"1 dspace geoname_2_ID other NaN 2010-01-15 10:09:25 \n", "1 dspace geoname_2_ID other NaN 2010-01-15 10:09:25 \n",
"2 dspace geoname_2_ZA other [AS, B1, AI] 2015-02-10 06:35:50 \n", "2 dspace geoname_2_ZA other [B1, AI, AS] 2015-02-10 06:35:50 \n",
"3 dspace geoname_2_GB other NaN 2008-05-15 11:29:17 \n", "3 dspace geoname_2_GB other NaN 2008-05-15 11:29:17 \n",
"4 dspace geoname_2_PE other NaN 2019-09-02 21:20:31 \n", "4 dspace geoname_2_PE other NaN 2019-09-02 21:20:31 \n",
"\n", "\n",
@ -5261,9 +5357,9 @@
"\n", "\n",
" roar_registry_name roar_registry_id \\\n", " roar_registry_name roar_registry_id \\\n",
"0 celestial 1596 \n", "0 celestial 1596 \n",
"1 [roarmap, celestial, opendoar] [2101, 1717, 283] \n", "1 [opendoar, roarmap, celestial] [1717, 2101, 283] \n",
"2 roarmap NaN \n", "2 roarmap NaN \n",
"3 [celestial, opendoar] [1779, 1627] \n", "3 [opendoar, celestial] [1779, 1627] \n",
"4 opendoar http://v2.sherpa.ac.uk/id/repository/4422 \n", "4 opendoar http://v2.sherpa.ac.uk/id/repository/4422 \n",
"\n", "\n",
" roar_submit_to roar_submitted_to_name roar_submitted_to_done \\\n", " roar_submit_to roar_submitted_to_name roar_submitted_to_done \\\n",
@ -5295,7 +5391,7 @@
"4 NaN NaN roar_15142 " "4 NaN NaN roar_15142 "
] ]
}, },
"execution_count": 21, "execution_count": 24,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -5310,7 +5406,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 25,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -5865,7 +5961,7 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n", " <td>[opendoar, celestial]</td>\n",
" <td>[1832, 1149]</td>\n", " <td>[1832, 1149]</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -6229,8 +6325,8 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n", " <td>[opendoar, celestial]</td>\n",
" <td>[2545, 5072]</td>\n", " <td>[5072, 2545]</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -6398,7 +6494,7 @@
" <td>opus</td>\n", " <td>opus</td>\n",
" <td>geoname_2_DE</td>\n", " <td>geoname_2_DE</td>\n",
" <td>other</td>\n", " <td>other</td>\n",
" <td>[GE, T1, HB]</td>\n", " <td>[HB, GE, T1]</td>\n",
" <td>2016-04-28 13:58:38</td>\n", " <td>2016-04-28 13:58:38</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>please delete ID 5891</td>\n", " <td>please delete ID 5891</td>\n",
@ -6411,8 +6507,8 @@
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n", " <td>[opendoar, celestial]</td>\n",
" <td>[2539, 6112]</td>\n", " <td>[6112, 2539]</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
" <td>NaN</td>\n", " <td>NaN</td>\n",
@ -6926,7 +7022,7 @@
"1 other NaN 2005-06-07 12:57:08 NaN \n", "1 other NaN 2005-06-07 12:57:08 NaN \n",
"2 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n",
"3 other NaN 2012-08-05 15:12:12 NaN \n", "3 other NaN 2012-08-05 15:12:12 NaN \n",
"4 other [GE, T1, HB] 2016-04-28 13:58:38 NaN \n", "4 other [HB, GE, T1] 2016-04-28 13:58:38 NaN \n",
"\n", "\n",
" roar_suggestions roar_activity_low roar_activity_medium \\\n", " roar_suggestions roar_activity_low roar_activity_medium \\\n",
"0 NaN NaN NaN \n", "0 NaN NaN NaN \n",
@ -6958,10 +7054,10 @@
"\n", "\n",
" roar_registry_name roar_registry_id roar_submit_to \\\n", " roar_registry_name roar_registry_id roar_submit_to \\\n",
"0 NaN NaN NaN \n", "0 NaN NaN NaN \n",
"1 [celestial, opendoar] [1832, 1149] NaN \n", "1 [opendoar, celestial] [1832, 1149] NaN \n",
"2 NaN NaN NaN \n", "2 NaN NaN NaN \n",
"3 [celestial, opendoar] [2545, 5072] NaN \n", "3 [opendoar, celestial] [5072, 2545] NaN \n",
"4 [celestial, opendoar] [2539, 6112] NaN \n", "4 [opendoar, celestial] [6112, 2539] NaN \n",
"\n", "\n",
" roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n", " roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n",
"0 NaN NaN NaN \n", "0 NaN NaN NaN \n",
@ -6992,7 +7088,7 @@
"4 NaN NaN roar_11212 " "4 NaN NaN roar_11212 "
] ]
}, },
"execution_count": 22, "execution_count": 25,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -7007,7 +7103,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 26,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -8668,7 +8764,7 @@
"4 NaN NaN " "4 NaN NaN "
] ]
}, },
"execution_count": 23, "execution_count": 26,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -8683,18 +8779,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 27,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"<ipython-input-24-3881fa0a0224>:1: PerformanceWarning:\n", "<ipython-input-27-3881fa0a0224>:1: PerformanceWarning:\n",
"\n", "\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n", "\n",
"<ipython-input-24-3881fa0a0224>:2: PerformanceWarning:\n", "<ipython-input-27-3881fa0a0224>:2: PerformanceWarning:\n",
"\n", "\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n" "\n"
@ -9058,7 +9154,7 @@
" <td>[dspace, dspace]</td>\n", " <td>[dspace, dspace]</td>\n",
" <td>[geoname_2_IN, geoname_2_IN]</td>\n", " <td>[geoname_2_IN, geoname_2_IN]</td>\n",
" <td>[other, other]</td>\n", " <td>[other, other]</td>\n",
" <td>[[TK, TJ, TN, TH, TP, TD, TA], [T1, TA]]</td>\n", " <td>[[TN, TA, TK, TH, TP, TD, TJ], [TA, T1]]</td>\n",
" <td>[2011-12-15 09:01:35, 2012-01-05 12:09:37]</td>\n", " <td>[2011-12-15 09:01:35, 2012-01-05 12:09:37]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
@ -9241,7 +9337,7 @@
" <td>[eprints, eprints]</td>\n", " <td>[eprints, eprints]</td>\n",
" <td>[geoname_2_IN, geoname_2_IN]</td>\n", " <td>[geoname_2_IN, geoname_2_IN]</td>\n",
" <td>[3.3.15 eps, 3.3.15 eps]</td>\n", " <td>[3.3.15 eps, 3.3.15 eps]</td>\n",
" <td>[[RB, RM], [R1, RZ]]</td>\n", " <td>[[RM, RB], [R1, RZ]]</td>\n",
" <td>[2014-03-07 15:07:45, 2014-03-19 07:05:04]</td>\n", " <td>[2014-03-07 15:07:45, 2014-03-19 07:05:04]</td>\n",
" <td>[The National Institute for Research in Tuberc...</td>\n", " <td>[The National Institute for Research in Tuberc...</td>\n",
" <td>[nan, Please include \"Tuberculosis\" as a Speci...</td>\n", " <td>[nan, Please include \"Tuberculosis\" as a Speci...</td>\n",
@ -9254,7 +9350,7 @@
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[[celestial, opendoar], celestial]</td>\n", " <td>[[opendoar, celestial], celestial]</td>\n",
" <td>[[5410, 2725], 5430]</td>\n", " <td>[[5410, 2725], 5430]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
@ -9437,7 +9533,7 @@
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[[celestial, opendoar], opendoar, opendoar]</td>\n", " <td>[[opendoar, celestial], opendoar, opendoar]</td>\n",
" <td>[[1781, 2426], 1781, 1807]</td>\n", " <td>[[1781, 2426], 1781, 1807]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
@ -9803,8 +9899,8 @@
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[[roarmap, celestial, opendoar], [celestial, o...</td>\n", " <td>[[opendoar, roarmap, celestial], [opendoar, ce...</td>\n",
" <td>[[1441, 193, 1456], [1441, 1456]]</td>\n", " <td>[[193, 1456, 1441], [1456, 1441]]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
@ -10343,8 +10439,8 @@
"4 [geoname_2_HK, geoname_2_CN] [other, other] \n", "4 [geoname_2_HK, geoname_2_CN] [other, other] \n",
"\n", "\n",
" roar_subjects \\\n", " roar_subjects \\\n",
"0 [[TK, TJ, TN, TH, TP, TD, TA], [T1, TA]] \n", "0 [[TN, TA, TK, TH, TP, TD, TJ], [TA, T1]] \n",
"1 [[RB, RM], [R1, RZ]] \n", "1 [[RM, RB], [R1, RZ]] \n",
"2 [nan, nan, nan] \n", "2 [nan, nan, nan] \n",
"3 [nan, nan] \n", "3 [nan, nan] \n",
"4 [nan, nan] \n", "4 [nan, nan] \n",
@ -10393,17 +10489,17 @@
"\n", "\n",
" roar_registry_name \\\n", " roar_registry_name \\\n",
"0 [celestial, celestial] \n", "0 [celestial, celestial] \n",
"1 [[celestial, opendoar], celestial] \n", "1 [[opendoar, celestial], celestial] \n",
"2 [[celestial, opendoar], opendoar, opendoar] \n", "2 [[opendoar, celestial], opendoar, opendoar] \n",
"3 [celestial, celestial] \n", "3 [celestial, celestial] \n",
"4 [[roarmap, celestial, opendoar], [celestial, o... \n", "4 [[opendoar, roarmap, celestial], [opendoar, ce... \n",
"\n", "\n",
" roar_registry_id roar_submit_to roar_submitted_to_name \\\n", " roar_registry_id roar_submit_to roar_submitted_to_name \\\n",
"0 [4790, 4789] [nan, nan] [nan, nan] \n", "0 [4790, 4789] [nan, nan] [nan, nan] \n",
"1 [[5410, 2725], 5430] [nan, nan] [nan, nan] \n", "1 [[5410, 2725], 5430] [nan, nan] [nan, nan] \n",
"2 [[1781, 2426], 1781, 1807] [nan, nan, nan] [nan, nan, nan] \n", "2 [[1781, 2426], 1781, 1807] [nan, nan, nan] [nan, nan, nan] \n",
"3 [4715, 4715] [nan, nan] [nan, nan] \n", "3 [4715, 4715] [nan, nan] [nan, nan] \n",
"4 [[1441, 193, 1456], [1441, 1456]] [nan, nan] [nan, nan] \n", "4 [[193, 1456, 1441], [1456, 1441]] [nan, nan] [nan, nan] \n",
"\n", "\n",
" roar_submitted_to_done roar_webometrics_rank roar_webometrics_size \\\n", " roar_submitted_to_done roar_webometrics_rank roar_webometrics_size \\\n",
"0 [nan, nan] [nan, nan] [nan, nan] \n", "0 [nan, nan] [nan, nan] [nan, nan] \n",
@ -10434,7 +10530,7 @@
"4 [nan, nan] [roar_1019, roar_5550] {roar} " "4 [nan, nan] [roar_1019, roar_5550] {roar} "
] ]
}, },
"execution_count": 24, "execution_count": 27,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -10447,18 +10543,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 28,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"<ipython-input-25-89649d18870f>:1: PerformanceWarning:\n", "<ipython-input-28-89649d18870f>:1: PerformanceWarning:\n",
"\n", "\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n", "\n",
"<ipython-input-25-89649d18870f>:2: PerformanceWarning:\n", "<ipython-input-28-89649d18870f>:2: PerformanceWarning:\n",
"\n", "\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n" "\n"
@ -10822,7 +10918,7 @@
" <td>[eprints, nan, nan]</td>\n", " <td>[eprints, nan, nan]</td>\n",
" <td>[geoname_2_MX, nan, nan]</td>\n", " <td>[geoname_2_MX, nan, nan]</td>\n",
" <td>[3.3.15 eps, nan, nan]</td>\n", " <td>[3.3.15 eps, nan, nan]</td>\n",
" <td>[[H1, HX, GF, HC, HD, HT, HB, HM, G1, JA, T1, ...</td>\n", " <td>[[HB, HD, HT, GF, HC, HF, HJ, HN, HX, H1, HG, ...</td>\n",
" <td>[2012-02-03 05:18:16, nan, nan]</td>\n", " <td>[2012-02-03 05:18:16, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
@ -10835,7 +10931,7 @@
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[[celestial, opendoar], nan, nan]</td>\n", " <td>[[opendoar, celestial], nan, nan]</td>\n",
" <td>[[4818, 2429], nan, nan]</td>\n", " <td>[[4818, 2429], nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
@ -11018,8 +11114,8 @@
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, [celestial, opendoar]]</td>\n", " <td>[nan, nan, [opendoar, celestial]]</td>\n",
" <td>[nan, nan, [5621, 3087]]</td>\n", " <td>[nan, nan, [3087, 5621]]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
@ -11201,8 +11297,8 @@
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, [celestial, opendoar], [celestial, opend...</td>\n", " <td>[nan, [opendoar, celestial], [opendoar, celest...</td>\n",
" <td>[nan, [4672, 2318], [4672, 2318]]</td>\n", " <td>[nan, [2318, 4672], [2318, 4672]]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
@ -11384,7 +11480,7 @@
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[[celestial, opendoar], [celestial, opendoar],...</td>\n", " <td>[[opendoar, celestial], [opendoar, celestial],...</td>\n",
" <td>[[1509, 1430], [1509, 1430], nan]</td>\n", " <td>[[1509, 1430], [1509, 1430], nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
@ -11567,8 +11663,8 @@
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[celestial, [celestial, opendoar], nan]</td>\n", " <td>[celestial, [opendoar, celestial], nan]</td>\n",
" <td>[4668, [2306, 4668], nan]</td>\n", " <td>[4668, [4668, 2306], nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n", " <td>[nan, nan, nan]</td>\n",
@ -12156,7 +12252,7 @@
"4 [geoname_2_CN, geoname_2_CN, nan] [other, other, nan] \n", "4 [geoname_2_CN, geoname_2_CN, nan] [other, other, nan] \n",
"\n", "\n",
" roar_subjects \\\n", " roar_subjects \\\n",
"0 [[H1, HX, GF, HC, HD, HT, HB, HM, G1, JA, T1, ... \n", "0 [[HB, HD, HT, GF, HC, HF, HJ, HN, HX, H1, HG, ... \n",
"1 [nan, nan, nan] \n", "1 [nan, nan, nan] \n",
"2 [nan, nan, nan] \n", "2 [nan, nan, nan] \n",
"3 [nan, nan, nan] \n", "3 [nan, nan, nan] \n",
@ -12191,18 +12287,18 @@
"4 [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] \n", "4 [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] \n",
"\n", "\n",
" roar_fulltexts_rdocs roar_registry_name \\\n", " roar_fulltexts_rdocs roar_registry_name \\\n",
"0 [nan, nan, nan] [[celestial, opendoar], nan, nan] \n", "0 [nan, nan, nan] [[opendoar, celestial], nan, nan] \n",
"1 [nan, nan, nan] [nan, nan, [celestial, opendoar]] \n", "1 [nan, nan, nan] [nan, nan, [opendoar, celestial]] \n",
"2 [nan, nan, nan] [nan, [celestial, opendoar], [celestial, opend... \n", "2 [nan, nan, nan] [nan, [opendoar, celestial], [opendoar, celest... \n",
"3 [nan, nan, nan] [[celestial, opendoar], [celestial, opendoar],... \n", "3 [nan, nan, nan] [[opendoar, celestial], [opendoar, celestial],... \n",
"4 [nan, nan, nan] [celestial, [celestial, opendoar], nan] \n", "4 [nan, nan, nan] [celestial, [opendoar, celestial], nan] \n",
"\n", "\n",
" roar_registry_id roar_submit_to roar_submitted_to_name \\\n", " roar_registry_id roar_submit_to roar_submitted_to_name \\\n",
"0 [[4818, 2429], nan, nan] [nan, nan, nan] [nan, nan, nan] \n", "0 [[4818, 2429], nan, nan] [nan, nan, nan] [nan, nan, nan] \n",
"1 [nan, nan, [5621, 3087]] [nan, nan, nan] [nan, nan, nan] \n", "1 [nan, nan, [3087, 5621]] [nan, nan, nan] [nan, nan, nan] \n",
"2 [nan, [4672, 2318], [4672, 2318]] [nan, nan, nan] [nan, nan, nan] \n", "2 [nan, [2318, 4672], [2318, 4672]] [nan, nan, nan] [nan, nan, nan] \n",
"3 [[1509, 1430], [1509, 1430], nan] [nan, nan, nan] [nan, nan, nan] \n", "3 [[1509, 1430], [1509, 1430], nan] [nan, nan, nan] [nan, nan, nan] \n",
"4 [4668, [2306, 4668], nan] [nan, nan, nan] [nan, nan, nan] \n", "4 [4668, [4668, 2306], nan] [nan, nan, nan] [nan, nan, nan] \n",
"\n", "\n",
" roar_submitted_to_done roar_webometrics_rank roar_webometrics_size \\\n", " roar_submitted_to_done roar_webometrics_rank roar_webometrics_size \\\n",
"0 [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] \n", "0 [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] \n",
@ -12233,7 +12329,7 @@
"4 [nan, nan, nan] [roar_4379, roar_4266, nan] {OpenDOAR, roar} " "4 [nan, nan, nan] [roar_4379, roar_4266, nan] {OpenDOAR, roar} "
] ]
}, },
"execution_count": 25, "execution_count": 28,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -12246,18 +12342,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"<ipython-input-26-7abf9225ca42>:1: PerformanceWarning:\n", "<ipython-input-29-7abf9225ca42>:1: PerformanceWarning:\n",
"\n", "\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n", "\n",
"<ipython-input-26-7abf9225ca42>:2: PerformanceWarning:\n", "<ipython-input-29-7abf9225ca42>:2: PerformanceWarning:\n",
"\n", "\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n" "\n"
@ -12648,7 +12744,7 @@
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>{re3data, FAIRsharing}</td>\n", " <td>{FAIRsharing, re3data}</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
@ -12804,7 +12900,7 @@
" <td>[nan, dspace]</td>\n", " <td>[nan, dspace]</td>\n",
" <td>[nan, geoname_2_UA]</td>\n", " <td>[nan, geoname_2_UA]</td>\n",
" <td>[nan, other]</td>\n", " <td>[nan, other]</td>\n",
" <td>[nan, [H1, D1, D204, BL, DK, D901, AC, BF, L1,...</td>\n", " <td>[nan, [B1, BS, BL, AC, D204, DK, HM, BF, L1, D...</td>\n",
" <td>[nan, 2015-07-07 12:38:37]</td>\n", " <td>[nan, 2015-07-07 12:38:37]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
@ -12817,8 +12913,8 @@
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, [celestial, opendoar]]</td>\n", " <td>[nan, [opendoar, celestial]]</td>\n",
" <td>[nan, [3410, 5883]]</td>\n", " <td>[nan, [5883, 3410]]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
@ -13014,7 +13110,7 @@
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>{re3data, FAIRsharing}</td>\n", " <td>{FAIRsharing, re3data}</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>3</th>\n", " <th>3</th>\n",
@ -13366,7 +13462,7 @@
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[[celestial, opendoar], nan]</td>\n", " <td>[[opendoar, celestial], nan]</td>\n",
" <td>[[1426, 1294], nan]</td>\n", " <td>[[1426, 1294], nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n", " <td>[nan, nan]</td>\n",
@ -13991,7 +14087,7 @@
"\n", "\n",
" roar_subjects \\\n", " roar_subjects \\\n",
"0 [nan, nan] \n", "0 [nan, nan] \n",
"1 [nan, [H1, D1, D204, BL, DK, D901, AC, BF, L1,... \n", "1 [nan, [B1, BS, BL, AC, D204, DK, HM, BF, L1, D... \n",
"2 [nan, nan] \n", "2 [nan, nan] \n",
"3 [nan, nan] \n", "3 [nan, nan] \n",
"4 [nan, nan] \n", "4 [nan, nan] \n",
@ -14033,10 +14129,10 @@
"\n", "\n",
" roar_registry_name roar_registry_id \\\n", " roar_registry_name roar_registry_id \\\n",
"0 [nan, nan] [nan, nan] \n", "0 [nan, nan] [nan, nan] \n",
"1 [nan, [celestial, opendoar]] [nan, [3410, 5883]] \n", "1 [nan, [opendoar, celestial]] [nan, [5883, 3410]] \n",
"2 [nan, nan] [nan, nan] \n", "2 [nan, nan] [nan, nan] \n",
"3 [roarmap, nan] [http://roarmap.eprints.org/1046/, nan] \n", "3 [roarmap, nan] [http://roarmap.eprints.org/1046/, nan] \n",
"4 [[celestial, opendoar], nan] [[1426, 1294], nan] \n", "4 [[opendoar, celestial], nan] [[1426, 1294], nan] \n",
"\n", "\n",
" roar_submit_to roar_submitted_to_name roar_submitted_to_done \\\n", " roar_submit_to roar_submitted_to_name roar_submitted_to_done \\\n",
"0 [nan, nan] [nan, nan] [nan, nan] \n", "0 [nan, nan] [nan, nan] [nan, nan] \n",
@ -14067,14 +14163,14 @@
"4 [nan, nan] [nan, nan] [roar_610, nan] \n", "4 [nan, nan] [nan, nan] [roar_610, nan] \n",
"\n", "\n",
" source_set \n", " source_set \n",
"0 {re3data, FAIRsharing} \n", "0 {FAIRsharing, re3data} \n",
"1 {OpenDOAR, roar} \n", "1 {OpenDOAR, roar} \n",
"2 {re3data, FAIRsharing} \n", "2 {FAIRsharing, re3data} \n",
"3 {OpenDOAR, roar} \n", "3 {OpenDOAR, roar} \n",
"4 {OpenDOAR, roar} " "4 {OpenDOAR, roar} "
] ]
}, },
"execution_count": 26, "execution_count": 29,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -14087,7 +14183,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 30,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [