added simple checks for across registrations

This commit is contained in:
Andrea Mannocci 2021-10-06 15:22:23 +02:00
parent 264f527fcb
commit 74bb9edd04
1 changed files with 234 additions and 138 deletions

View File

@ -1448,8 +1448,8 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[58, 669]</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[669, 58]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -1527,7 +1527,7 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[526, 258]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -1663,8 +1663,8 @@
" <td>TRUE</td>\n",
" <td>TRUE</td>\n",
" <td>TRUE</td>\n",
" <td>[Climate Service Center 2.0, KLIMZUG projects,...</td>\n",
" <td>[http://www.klimzug.de/de/94.php, http://www.c...</td>\n",
" <td>[Helmholtz-Zentrum Geesthacht, Climate Service...</td>\n",
" <td>[http://www.climateservicecenter.de/, http://w...</td>\n",
" <td>de</td>\n",
" <td>Hamburg</td>\n",
" <td>53.5511</td>\n",
@ -1672,7 +1672,7 @@
" <td>opus</td>\n",
" <td>geoname_2_DE</td>\n",
" <td>other</td>\n",
" <td>[GE, GF, S1, HD, G1]</td>\n",
" <td>[GE, S1, G1, HD, GF]</td>\n",
" <td>2015-07-02 08:08:31</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -1685,7 +1685,7 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[5881, 3408]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -1893,14 +1893,14 @@
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 [Climate Service Center 2.0, KLIMZUG projects,... \n",
"3 [Helmholtz-Zentrum Geesthacht, Climate Service... \n",
"4 Skidmore College \n",
"\n",
" roar_organisation_home_page roar_location_country \\\n",
"0 NaN fr \n",
"1 NaN se \n",
"2 NaN pt \n",
"3 [http://www.klimzug.de/de/94.php, http://www.c... de \n",
"3 [http://www.climateservicecenter.de/, http://w... de \n",
"4 http://www.skidmore.edu/ us \n",
"\n",
" roar_location_city roar_location_latitude roar_location_longitude \\\n",
@ -1914,7 +1914,7 @@
"0 hal geoname_2_FR other NaN \n",
"1 diva geoname_2_SE other NaN \n",
"2 dspace geoname_2_PT other NaN \n",
"3 opus geoname_2_DE other [GE, GF, S1, HD, G1] \n",
"3 opus geoname_2_DE other [GE, S1, G1, HD, GF] \n",
"4 bepress geoname_2_US other NaN \n",
"\n",
" roar_date roar_note roar_suggestions roar_activity_low \\\n",
@ -1946,10 +1946,10 @@
"4 NaN NaN NaN \n",
"\n",
" roar_registry_name roar_registry_id roar_submit_to \\\n",
"0 [celestial, opendoar] [58, 669] NaN \n",
"1 [celestial, opendoar] [526, 258] NaN \n",
"0 [opendoar, celestial] [669, 58] NaN \n",
"1 [opendoar, celestial] [526, 258] NaN \n",
"2 NaN NaN NaN \n",
"3 [celestial, opendoar] [5881, 3408] NaN \n",
"3 [opendoar, celestial] [5881, 3408] NaN \n",
"4 celestial 5882 NaN \n",
"\n",
" roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n",
@ -2163,7 +2163,7 @@
" <td>dspace</td>\n",
" <td>geoname_2_UA</td>\n",
" <td>other</td>\n",
" <td>[H1, D1, D204, BL, DK, D901, AC, BF, L1, BS, H...</td>\n",
" <td>[B1, BS, BL, AC, D204, DK, HM, BF, L1, D1, H1,...</td>\n",
" <td>2015-07-07 12:38:37</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -2176,8 +2176,8 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[3410, 5883]</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[5883, 3410]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -2245,7 +2245,7 @@
"7 NaN NaN dspace geoname_2_UA \n",
"\n",
" roar_version roar_subjects \\\n",
"7 other [H1, D1, D204, BL, DK, D901, AC, BF, L1, BS, H... \n",
"7 other [B1, BS, BL, AC, D204, DK, HM, BF, L1, D1, H1,... \n",
"\n",
" roar_date roar_note roar_suggestions roar_activity_low \\\n",
"7 2015-07-07 12:38:37 NaN NaN NaN \n",
@ -2257,7 +2257,7 @@
"7 NaN NaN NaN \n",
"\n",
" roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to \\\n",
"7 NaN [celestial, opendoar] [3410, 5883] NaN \n",
"7 NaN [opendoar, celestial] [5883, 3410] NaN \n",
"\n",
" roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n",
"7 NaN NaN NaN \n",
@ -2417,15 +2417,6 @@
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"dup.groupby('dedup_id').aggregate(list).reset_index()[['dedup_id', 'name']].to_csv('../data/interim/to_validate.csv')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
@ -2517,7 +2508,7 @@
"freq 4 1977 1 "
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -2526,6 +2517,120 @@
"dup.describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"dup_grouped = dup.groupby('dedup_id').aggregate(list)\n",
"dup_grouped['source_set'] = dup_grouped.source.map(set)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"duplicate_id 6\n",
"original_id 6\n",
"name 6\n",
"source 6\n",
"unique_id 6\n",
"source_set 6\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup_grouped[dup_grouped.source_set.str.len() == 4].count()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"duplicate_id 60\n",
"original_id 60\n",
"name 60\n",
"source 60\n",
"unique_id 60\n",
"source_set 60\n",
"dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup_grouped[dup_grouped.source_set.str.len() == 3].count()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"duplicate_id 1986\n",
"original_id 1986\n",
"name 1986\n",
"source 1986\n",
"unique_id 1986\n",
"source_set 1986\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup_grouped[dup_grouped.source_set.str.len() == 2].count()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"duplicate_id 139\n",
"original_id 139\n",
"name 139\n",
"source 139\n",
"unique_id 139\n",
"source_set 139\n",
"dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup_grouped[dup_grouped.source_set.str.len() == 1].count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -2535,7 +2640,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@ -2676,7 +2781,7 @@
"dedup::03e0704b5690a2dee1861dc3ad3316c9 {roar} "
]
},
"execution_count": 10,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -2690,7 +2795,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 15,
"metadata": {},
"outputs": [
{
@ -2831,7 +2936,7 @@
"dedup::03e0704b5690a2dee1861dc3ad3316c9 roar "
]
},
"execution_count": 11,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@ -2843,7 +2948,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 16,
"metadata": {},
"outputs": [
{
@ -2919,7 +3024,7 @@
"roar 121 121 121 121 121"
]
},
"execution_count": 12,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@ -2930,7 +3035,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@ -3097,7 +3202,7 @@
"[287 rows x 6 columns]"
]
},
"execution_count": 13,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@ -3116,7 +3221,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@ -3309,7 +3414,7 @@
"[440 rows x 6 columns]"
]
},
"execution_count": 14,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@ -3333,7 +3438,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 19,
"metadata": {},
"outputs": [
{
@ -3526,7 +3631,7 @@
"[3890 rows x 6 columns]"
]
},
"execution_count": 15,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@ -3547,7 +3652,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@ -3562,7 +3667,7 @@
"dtype: int64"
]
},
"execution_count": 16,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@ -3573,7 +3678,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 21,
"metadata": {},
"outputs": [
{
@ -3588,7 +3693,7 @@
"dtype: int64"
]
},
"execution_count": 17,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@ -3599,7 +3704,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@ -3608,7 +3713,7 @@
"2191"
]
},
"execution_count": 18,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@ -3619,7 +3724,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 23,
"metadata": {},
"outputs": [
{
@ -3628,7 +3733,7 @@
"2191"
]
},
"execution_count": 19,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@ -3637,15 +3742,6 @@
"dup.groupby('dedup_id').ngroups"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"dup_within.to_csv('../data/processed/dup_within.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -3655,7 +3751,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 24,
"metadata": {},
"outputs": [
{
@ -4188,8 +4284,8 @@
" <td>TRUE</td>\n",
" <td>TRUE</td>\n",
" <td>FALSE</td>\n",
" <td>[University of Sumatera Utara, USU Library]</td>\n",
" <td>[http://www.usu.ac.id, http://library.usu.ac.id]</td>\n",
" <td>[USU Library, University of Sumatera Utara]</td>\n",
" <td>[http://library.usu.ac.id, http://www.usu.ac.id]</td>\n",
" <td>id</td>\n",
" <td>Medan</td>\n",
" <td>3.5595</td>\n",
@ -4210,8 +4306,8 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[roarmap, celestial, opendoar]</td>\n",
" <td>[2101, 1717, 283]</td>\n",
" <td>[opendoar, roarmap, celestial]</td>\n",
" <td>[1717, 2101, 283]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -4379,7 +4475,7 @@
" <td>dspace</td>\n",
" <td>geoname_2_ZA</td>\n",
" <td>other</td>\n",
" <td>[AS, B1, AI]</td>\n",
" <td>[B1, AI, AS]</td>\n",
" <td>2015-02-10 06:35:50</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -4574,7 +4670,7 @@
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[1779, 1627]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -5205,14 +5301,14 @@
"\n",
" roar_open_access roar_mandate roar_organisation_title \\\n",
"0 FALSE TRUE Università degli Studi di Milano \n",
"1 TRUE FALSE [University of Sumatera Utara, USU Library] \n",
"1 TRUE FALSE [USU Library, University of Sumatera Utara] \n",
"2 TRUE FALSE National Research Foundation of South Africa \n",
"3 NaN NaN Swansea Metropolitan University \n",
"4 FALSE FALSE NaN \n",
"\n",
" roar_organisation_home_page roar_location_country \\\n",
"0 http://www.unimi.it it \n",
"1 [http://www.usu.ac.id, http://library.usu.ac.id] id \n",
"1 [http://library.usu.ac.id, http://www.usu.ac.id] id \n",
"2 http://www.nrf.ac.za/ za \n",
"3 http://www.smu.ac.uk/ gb \n",
"4 NaN pe \n",
@ -5227,7 +5323,7 @@
" roar_software roar_geoname roar_version roar_subjects roar_date \\\n",
"0 dspace geoname_2_IT other NaN 2014-05-04 17:40:53 \n",
"1 dspace geoname_2_ID other NaN 2010-01-15 10:09:25 \n",
"2 dspace geoname_2_ZA other [AS, B1, AI] 2015-02-10 06:35:50 \n",
"2 dspace geoname_2_ZA other [B1, AI, AS] 2015-02-10 06:35:50 \n",
"3 dspace geoname_2_GB other NaN 2008-05-15 11:29:17 \n",
"4 dspace geoname_2_PE other NaN 2019-09-02 21:20:31 \n",
"\n",
@ -5261,9 +5357,9 @@
"\n",
" roar_registry_name roar_registry_id \\\n",
"0 celestial 1596 \n",
"1 [roarmap, celestial, opendoar] [2101, 1717, 283] \n",
"1 [opendoar, roarmap, celestial] [1717, 2101, 283] \n",
"2 roarmap NaN \n",
"3 [celestial, opendoar] [1779, 1627] \n",
"3 [opendoar, celestial] [1779, 1627] \n",
"4 opendoar http://v2.sherpa.ac.uk/id/repository/4422 \n",
"\n",
" roar_submit_to roar_submitted_to_name roar_submitted_to_done \\\n",
@ -5295,7 +5391,7 @@
"4 NaN NaN roar_15142 "
]
},
"execution_count": 21,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@ -5310,7 +5406,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 25,
"metadata": {},
"outputs": [
{
@ -5865,7 +5961,7 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[1832, 1149]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -6229,8 +6325,8 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[2545, 5072]</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[5072, 2545]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -6398,7 +6494,7 @@
" <td>opus</td>\n",
" <td>geoname_2_DE</td>\n",
" <td>other</td>\n",
" <td>[GE, T1, HB]</td>\n",
" <td>[HB, GE, T1]</td>\n",
" <td>2016-04-28 13:58:38</td>\n",
" <td>NaN</td>\n",
" <td>please delete ID 5891</td>\n",
@ -6411,8 +6507,8 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[celestial, opendoar]</td>\n",
" <td>[2539, 6112]</td>\n",
" <td>[opendoar, celestial]</td>\n",
" <td>[6112, 2539]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -6926,7 +7022,7 @@
"1 other NaN 2005-06-07 12:57:08 NaN \n",
"2 NaN NaN NaN NaN \n",
"3 other NaN 2012-08-05 15:12:12 NaN \n",
"4 other [GE, T1, HB] 2016-04-28 13:58:38 NaN \n",
"4 other [HB, GE, T1] 2016-04-28 13:58:38 NaN \n",
"\n",
" roar_suggestions roar_activity_low roar_activity_medium \\\n",
"0 NaN NaN NaN \n",
@ -6958,10 +7054,10 @@
"\n",
" roar_registry_name roar_registry_id roar_submit_to \\\n",
"0 NaN NaN NaN \n",
"1 [celestial, opendoar] [1832, 1149] NaN \n",
"1 [opendoar, celestial] [1832, 1149] NaN \n",
"2 NaN NaN NaN \n",
"3 [celestial, opendoar] [2545, 5072] NaN \n",
"4 [celestial, opendoar] [2539, 6112] NaN \n",
"3 [opendoar, celestial] [5072, 2545] NaN \n",
"4 [opendoar, celestial] [6112, 2539] NaN \n",
"\n",
" roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n",
"0 NaN NaN NaN \n",
@ -6992,7 +7088,7 @@
"4 NaN NaN roar_11212 "
]
},
"execution_count": 22,
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@ -7007,7 +7103,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 26,
"metadata": {},
"outputs": [
{
@ -8668,7 +8764,7 @@
"4 NaN NaN "
]
},
"execution_count": 23,
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@ -8683,18 +8779,18 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-24-3881fa0a0224>:1: PerformanceWarning:\n",
"<ipython-input-27-3881fa0a0224>:1: PerformanceWarning:\n",
"\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n",
"<ipython-input-24-3881fa0a0224>:2: PerformanceWarning:\n",
"<ipython-input-27-3881fa0a0224>:2: PerformanceWarning:\n",
"\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n"
@ -9058,7 +9154,7 @@
" <td>[dspace, dspace]</td>\n",
" <td>[geoname_2_IN, geoname_2_IN]</td>\n",
" <td>[other, other]</td>\n",
" <td>[[TK, TJ, TN, TH, TP, TD, TA], [T1, TA]]</td>\n",
" <td>[[TN, TA, TK, TH, TP, TD, TJ], [TA, T1]]</td>\n",
" <td>[2011-12-15 09:01:35, 2012-01-05 12:09:37]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
@ -9241,7 +9337,7 @@
" <td>[eprints, eprints]</td>\n",
" <td>[geoname_2_IN, geoname_2_IN]</td>\n",
" <td>[3.3.15 eps, 3.3.15 eps]</td>\n",
" <td>[[RB, RM], [R1, RZ]]</td>\n",
" <td>[[RM, RB], [R1, RZ]]</td>\n",
" <td>[2014-03-07 15:07:45, 2014-03-19 07:05:04]</td>\n",
" <td>[The National Institute for Research in Tuberc...</td>\n",
" <td>[nan, Please include \"Tuberculosis\" as a Speci...</td>\n",
@ -9254,7 +9350,7 @@
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[[celestial, opendoar], celestial]</td>\n",
" <td>[[opendoar, celestial], celestial]</td>\n",
" <td>[[5410, 2725], 5430]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
@ -9437,7 +9533,7 @@
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[[celestial, opendoar], opendoar, opendoar]</td>\n",
" <td>[[opendoar, celestial], opendoar, opendoar]</td>\n",
" <td>[[1781, 2426], 1781, 1807]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
@ -9803,8 +9899,8 @@
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[[roarmap, celestial, opendoar], [celestial, o...</td>\n",
" <td>[[1441, 193, 1456], [1441, 1456]]</td>\n",
" <td>[[opendoar, roarmap, celestial], [opendoar, ce...</td>\n",
" <td>[[193, 1456, 1441], [1456, 1441]]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
@ -10343,8 +10439,8 @@
"4 [geoname_2_HK, geoname_2_CN] [other, other] \n",
"\n",
" roar_subjects \\\n",
"0 [[TK, TJ, TN, TH, TP, TD, TA], [T1, TA]] \n",
"1 [[RB, RM], [R1, RZ]] \n",
"0 [[TN, TA, TK, TH, TP, TD, TJ], [TA, T1]] \n",
"1 [[RM, RB], [R1, RZ]] \n",
"2 [nan, nan, nan] \n",
"3 [nan, nan] \n",
"4 [nan, nan] \n",
@ -10393,17 +10489,17 @@
"\n",
" roar_registry_name \\\n",
"0 [celestial, celestial] \n",
"1 [[celestial, opendoar], celestial] \n",
"2 [[celestial, opendoar], opendoar, opendoar] \n",
"1 [[opendoar, celestial], celestial] \n",
"2 [[opendoar, celestial], opendoar, opendoar] \n",
"3 [celestial, celestial] \n",
"4 [[roarmap, celestial, opendoar], [celestial, o... \n",
"4 [[opendoar, roarmap, celestial], [opendoar, ce... \n",
"\n",
" roar_registry_id roar_submit_to roar_submitted_to_name \\\n",
"0 [4790, 4789] [nan, nan] [nan, nan] \n",
"1 [[5410, 2725], 5430] [nan, nan] [nan, nan] \n",
"2 [[1781, 2426], 1781, 1807] [nan, nan, nan] [nan, nan, nan] \n",
"3 [4715, 4715] [nan, nan] [nan, nan] \n",
"4 [[1441, 193, 1456], [1441, 1456]] [nan, nan] [nan, nan] \n",
"4 [[193, 1456, 1441], [1456, 1441]] [nan, nan] [nan, nan] \n",
"\n",
" roar_submitted_to_done roar_webometrics_rank roar_webometrics_size \\\n",
"0 [nan, nan] [nan, nan] [nan, nan] \n",
@ -10434,7 +10530,7 @@
"4 [nan, nan] [roar_1019, roar_5550] {roar} "
]
},
"execution_count": 24,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@ -10447,18 +10543,18 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-25-89649d18870f>:1: PerformanceWarning:\n",
"<ipython-input-28-89649d18870f>:1: PerformanceWarning:\n",
"\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n",
"<ipython-input-25-89649d18870f>:2: PerformanceWarning:\n",
"<ipython-input-28-89649d18870f>:2: PerformanceWarning:\n",
"\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n"
@ -10822,7 +10918,7 @@
" <td>[eprints, nan, nan]</td>\n",
" <td>[geoname_2_MX, nan, nan]</td>\n",
" <td>[3.3.15 eps, nan, nan]</td>\n",
" <td>[[H1, HX, GF, HC, HD, HT, HB, HM, G1, JA, T1, ...</td>\n",
" <td>[[HB, HD, HT, GF, HC, HF, HJ, HN, HX, H1, HG, ...</td>\n",
" <td>[2012-02-03 05:18:16, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
@ -10835,7 +10931,7 @@
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[[celestial, opendoar], nan, nan]</td>\n",
" <td>[[opendoar, celestial], nan, nan]</td>\n",
" <td>[[4818, 2429], nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
@ -11018,8 +11114,8 @@
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, [celestial, opendoar]]</td>\n",
" <td>[nan, nan, [5621, 3087]]</td>\n",
" <td>[nan, nan, [opendoar, celestial]]</td>\n",
" <td>[nan, nan, [3087, 5621]]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
@ -11201,8 +11297,8 @@
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, [celestial, opendoar], [celestial, opend...</td>\n",
" <td>[nan, [4672, 2318], [4672, 2318]]</td>\n",
" <td>[nan, [opendoar, celestial], [opendoar, celest...</td>\n",
" <td>[nan, [2318, 4672], [2318, 4672]]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
@ -11384,7 +11480,7 @@
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[[celestial, opendoar], [celestial, opendoar],...</td>\n",
" <td>[[opendoar, celestial], [opendoar, celestial],...</td>\n",
" <td>[[1509, 1430], [1509, 1430], nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
@ -11567,8 +11663,8 @@
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[celestial, [celestial, opendoar], nan]</td>\n",
" <td>[4668, [2306, 4668], nan]</td>\n",
" <td>[celestial, [opendoar, celestial], nan]</td>\n",
" <td>[4668, [4668, 2306], nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
" <td>[nan, nan, nan]</td>\n",
@ -12156,7 +12252,7 @@
"4 [geoname_2_CN, geoname_2_CN, nan] [other, other, nan] \n",
"\n",
" roar_subjects \\\n",
"0 [[H1, HX, GF, HC, HD, HT, HB, HM, G1, JA, T1, ... \n",
"0 [[HB, HD, HT, GF, HC, HF, HJ, HN, HX, H1, HG, ... \n",
"1 [nan, nan, nan] \n",
"2 [nan, nan, nan] \n",
"3 [nan, nan, nan] \n",
@ -12191,18 +12287,18 @@
"4 [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] \n",
"\n",
" roar_fulltexts_rdocs roar_registry_name \\\n",
"0 [nan, nan, nan] [[celestial, opendoar], nan, nan] \n",
"1 [nan, nan, nan] [nan, nan, [celestial, opendoar]] \n",
"2 [nan, nan, nan] [nan, [celestial, opendoar], [celestial, opend... \n",
"3 [nan, nan, nan] [[celestial, opendoar], [celestial, opendoar],... \n",
"4 [nan, nan, nan] [celestial, [celestial, opendoar], nan] \n",
"0 [nan, nan, nan] [[opendoar, celestial], nan, nan] \n",
"1 [nan, nan, nan] [nan, nan, [opendoar, celestial]] \n",
"2 [nan, nan, nan] [nan, [opendoar, celestial], [opendoar, celest... \n",
"3 [nan, nan, nan] [[opendoar, celestial], [opendoar, celestial],... \n",
"4 [nan, nan, nan] [celestial, [opendoar, celestial], nan] \n",
"\n",
" roar_registry_id roar_submit_to roar_submitted_to_name \\\n",
"0 [[4818, 2429], nan, nan] [nan, nan, nan] [nan, nan, nan] \n",
"1 [nan, nan, [5621, 3087]] [nan, nan, nan] [nan, nan, nan] \n",
"2 [nan, [4672, 2318], [4672, 2318]] [nan, nan, nan] [nan, nan, nan] \n",
"1 [nan, nan, [3087, 5621]] [nan, nan, nan] [nan, nan, nan] \n",
"2 [nan, [2318, 4672], [2318, 4672]] [nan, nan, nan] [nan, nan, nan] \n",
"3 [[1509, 1430], [1509, 1430], nan] [nan, nan, nan] [nan, nan, nan] \n",
"4 [4668, [2306, 4668], nan] [nan, nan, nan] [nan, nan, nan] \n",
"4 [4668, [4668, 2306], nan] [nan, nan, nan] [nan, nan, nan] \n",
"\n",
" roar_submitted_to_done roar_webometrics_rank roar_webometrics_size \\\n",
"0 [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] \n",
@ -12233,7 +12329,7 @@
"4 [nan, nan, nan] [roar_4379, roar_4266, nan] {OpenDOAR, roar} "
]
},
"execution_count": 25,
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@ -12246,18 +12342,18 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-26-7abf9225ca42>:1: PerformanceWarning:\n",
"<ipython-input-29-7abf9225ca42>:1: PerformanceWarning:\n",
"\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n",
"<ipython-input-26-7abf9225ca42>:2: PerformanceWarning:\n",
"<ipython-input-29-7abf9225ca42>:2: PerformanceWarning:\n",
"\n",
"DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
"\n"
@ -12648,7 +12744,7 @@
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>{re3data, FAIRsharing}</td>\n",
" <td>{FAIRsharing, re3data}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
@ -12804,7 +12900,7 @@
" <td>[nan, dspace]</td>\n",
" <td>[nan, geoname_2_UA]</td>\n",
" <td>[nan, other]</td>\n",
" <td>[nan, [H1, D1, D204, BL, DK, D901, AC, BF, L1,...</td>\n",
" <td>[nan, [B1, BS, BL, AC, D204, DK, HM, BF, L1, D...</td>\n",
" <td>[nan, 2015-07-07 12:38:37]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
@ -12817,8 +12913,8 @@
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, [celestial, opendoar]]</td>\n",
" <td>[nan, [3410, 5883]]</td>\n",
" <td>[nan, [opendoar, celestial]]</td>\n",
" <td>[nan, [5883, 3410]]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
@ -13014,7 +13110,7 @@
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>{re3data, FAIRsharing}</td>\n",
" <td>{FAIRsharing, re3data}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
@ -13366,7 +13462,7 @@
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[[celestial, opendoar], nan]</td>\n",
" <td>[[opendoar, celestial], nan]</td>\n",
" <td>[[1426, 1294], nan]</td>\n",
" <td>[nan, nan]</td>\n",
" <td>[nan, nan]</td>\n",
@ -13991,7 +14087,7 @@
"\n",
" roar_subjects \\\n",
"0 [nan, nan] \n",
"1 [nan, [H1, D1, D204, BL, DK, D901, AC, BF, L1,... \n",
"1 [nan, [B1, BS, BL, AC, D204, DK, HM, BF, L1, D... \n",
"2 [nan, nan] \n",
"3 [nan, nan] \n",
"4 [nan, nan] \n",
@ -14033,10 +14129,10 @@
"\n",
" roar_registry_name roar_registry_id \\\n",
"0 [nan, nan] [nan, nan] \n",
"1 [nan, [celestial, opendoar]] [nan, [3410, 5883]] \n",
"1 [nan, [opendoar, celestial]] [nan, [5883, 3410]] \n",
"2 [nan, nan] [nan, nan] \n",
"3 [roarmap, nan] [http://roarmap.eprints.org/1046/, nan] \n",
"4 [[celestial, opendoar], nan] [[1426, 1294], nan] \n",
"4 [[opendoar, celestial], nan] [[1426, 1294], nan] \n",
"\n",
" roar_submit_to roar_submitted_to_name roar_submitted_to_done \\\n",
"0 [nan, nan] [nan, nan] [nan, nan] \n",
@ -14067,14 +14163,14 @@
"4 [nan, nan] [nan, nan] [roar_610, nan] \n",
"\n",
" source_set \n",
"0 {re3data, FAIRsharing} \n",
"0 {FAIRsharing, re3data} \n",
"1 {OpenDOAR, roar} \n",
"2 {re3data, FAIRsharing} \n",
"2 {FAIRsharing, re3data} \n",
"3 {OpenDOAR, roar} \n",
"4 {OpenDOAR, roar} "
]
},
"execution_count": 26,
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@ -14087,7 +14183,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [