From 74bb9edd04eb03b6d77f571fc271e8540b03ee19 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Wed, 6 Oct 2021 15:22:23 +0200 Subject: [PATCH] added simple checks for across registrations --- notebooks/03-overlap.ipynb | 372 +++++++++++++++++++++++-------------- 1 file changed, 234 insertions(+), 138 deletions(-) diff --git a/notebooks/03-overlap.ipynb b/notebooks/03-overlap.ipynb index 513c272..88cee61 100644 --- a/notebooks/03-overlap.ipynb +++ b/notebooks/03-overlap.ipynb @@ -1448,8 +1448,8 @@ " NaN\n", " NaN\n", " NaN\n", - " [celestial, opendoar]\n", - " [58, 669]\n", + " [opendoar, celestial]\n", + " [669, 58]\n", " NaN\n", " NaN\n", " NaN\n", @@ -1527,7 +1527,7 @@ " NaN\n", " NaN\n", " NaN\n", - " [celestial, opendoar]\n", + " [opendoar, celestial]\n", " [526, 258]\n", " NaN\n", " NaN\n", @@ -1663,8 +1663,8 @@ " TRUE\n", " TRUE\n", " TRUE\n", - " [Climate Service Center 2.0, KLIMZUG projects,...\n", - " [http://www.klimzug.de/de/94.php, http://www.c...\n", + " [Helmholtz-Zentrum Geesthacht, Climate Service...\n", + " [http://www.climateservicecenter.de/, http://w...\n", " de\n", " Hamburg\n", " 53.5511\n", @@ -1672,7 +1672,7 @@ " opus\n", " geoname_2_DE\n", " other\n", - " [GE, GF, S1, HD, G1]\n", + " [GE, S1, G1, HD, GF]\n", " 2015-07-02 08:08:31\n", " NaN\n", " NaN\n", @@ -1685,7 +1685,7 @@ " NaN\n", " NaN\n", " NaN\n", - " [celestial, opendoar]\n", + " [opendoar, celestial]\n", " [5881, 3408]\n", " NaN\n", " NaN\n", @@ -1893,14 +1893,14 @@ "0 NaN \n", "1 NaN \n", "2 NaN \n", - "3 [Climate Service Center 2.0, KLIMZUG projects,... \n", + "3 [Helmholtz-Zentrum Geesthacht, Climate Service... \n", "4 Skidmore College \n", "\n", " roar_organisation_home_page roar_location_country \\\n", "0 NaN fr \n", "1 NaN se \n", "2 NaN pt \n", - "3 [http://www.klimzug.de/de/94.php, http://www.c... de \n", + "3 [http://www.climateservicecenter.de/, http://w... de \n", "4 http://www.skidmore.edu/ us \n", "\n", " roar_location_city roar_location_latitude roar_location_longitude \\\n", @@ -1914,7 +1914,7 @@ "0 hal geoname_2_FR other NaN \n", "1 diva geoname_2_SE other NaN \n", "2 dspace geoname_2_PT other NaN \n", - "3 opus geoname_2_DE other [GE, GF, S1, HD, G1] \n", + "3 opus geoname_2_DE other [GE, S1, G1, HD, GF] \n", "4 bepress geoname_2_US other NaN \n", "\n", " roar_date roar_note roar_suggestions roar_activity_low \\\n", @@ -1946,10 +1946,10 @@ "4 NaN NaN NaN \n", "\n", " roar_registry_name roar_registry_id roar_submit_to \\\n", - "0 [celestial, opendoar] [58, 669] NaN \n", - "1 [celestial, opendoar] [526, 258] NaN \n", + "0 [opendoar, celestial] [669, 58] NaN \n", + "1 [opendoar, celestial] [526, 258] NaN \n", "2 NaN NaN NaN \n", - "3 [celestial, opendoar] [5881, 3408] NaN \n", + "3 [opendoar, celestial] [5881, 3408] NaN \n", "4 celestial 5882 NaN \n", "\n", " roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n", @@ -2163,7 +2163,7 @@ " dspace\n", " geoname_2_UA\n", " other\n", - " [H1, D1, D204, BL, DK, D901, AC, BF, L1, BS, H...\n", + " [B1, BS, BL, AC, D204, DK, HM, BF, L1, D1, H1,...\n", " 2015-07-07 12:38:37\n", " NaN\n", " NaN\n", @@ -2176,8 +2176,8 @@ " NaN\n", " NaN\n", " NaN\n", - " [celestial, opendoar]\n", - " [3410, 5883]\n", + " [opendoar, celestial]\n", + " [5883, 3410]\n", " NaN\n", " NaN\n", " NaN\n", @@ -2245,7 +2245,7 @@ "7 NaN NaN dspace geoname_2_UA \n", "\n", " roar_version roar_subjects \\\n", - "7 other [H1, D1, D204, BL, DK, D901, AC, BF, L1, BS, H... \n", + "7 other [B1, BS, BL, AC, D204, DK, HM, BF, L1, D1, H1,... \n", "\n", " roar_date roar_note roar_suggestions roar_activity_low \\\n", "7 2015-07-07 12:38:37 NaN NaN NaN \n", @@ -2257,7 +2257,7 @@ "7 NaN NaN NaN \n", "\n", " roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to \\\n", - "7 NaN [celestial, opendoar] [3410, 5883] NaN \n", + "7 NaN [opendoar, celestial] [5883, 3410] NaN \n", "\n", " roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n", "7 NaN NaN NaN \n", @@ -2417,15 +2417,6 @@ "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [ - "dup.groupby('dedup_id').aggregate(list).reset_index()[['dedup_id', 'name']].to_csv('../data/interim/to_validate.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, "outputs": [ { "data": { @@ -2517,7 +2508,7 @@ "freq 4 1977 1 " ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -2526,6 +2517,120 @@ "dup.describe()" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "dup_grouped = dup.groupby('dedup_id').aggregate(list)\n", + "dup_grouped['source_set'] = dup_grouped.source.map(set)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "duplicate_id 6\n", + "original_id 6\n", + "name 6\n", + "source 6\n", + "unique_id 6\n", + "source_set 6\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dup_grouped[dup_grouped.source_set.str.len() == 4].count()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "duplicate_id 60\n", + "original_id 60\n", + "name 60\n", + "source 60\n", + "unique_id 60\n", + "source_set 60\n", + "dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dup_grouped[dup_grouped.source_set.str.len() == 3].count()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "duplicate_id 1986\n", + "original_id 1986\n", + "name 1986\n", + "source 1986\n", + "unique_id 1986\n", + "source_set 1986\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dup_grouped[dup_grouped.source_set.str.len() == 2].count()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "duplicate_id 139\n", + "original_id 139\n", + "name 139\n", + "source 139\n", + "unique_id 139\n", + "source_set 139\n", + "dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dup_grouped[dup_grouped.source_set.str.len() == 1].count()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2535,7 +2640,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -2676,7 +2781,7 @@ "dedup::03e0704b5690a2dee1861dc3ad3316c9 {roar} " ] }, - "execution_count": 10, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -2690,7 +2795,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -2831,7 +2936,7 @@ "dedup::03e0704b5690a2dee1861dc3ad3316c9 roar " ] }, - "execution_count": 11, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -2843,7 +2948,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -2919,7 +3024,7 @@ "roar 121 121 121 121 121" ] }, - "execution_count": 12, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -2930,7 +3035,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -3097,7 +3202,7 @@ "[287 rows x 6 columns]" ] }, - "execution_count": 13, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -3116,7 +3221,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -3309,7 +3414,7 @@ "[440 rows x 6 columns]" ] }, - "execution_count": 14, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -3333,7 +3438,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -3526,7 +3631,7 @@ "[3890 rows x 6 columns]" ] }, - "execution_count": 15, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -3547,7 +3652,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -3562,7 +3667,7 @@ "dtype: int64" ] }, - "execution_count": 16, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -3573,7 +3678,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -3588,7 +3693,7 @@ "dtype: int64" ] }, - "execution_count": 17, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -3599,7 +3704,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -3608,7 +3713,7 @@ "2191" ] }, - "execution_count": 18, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -3619,7 +3724,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -3628,7 +3733,7 @@ "2191" ] }, - "execution_count": 19, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -3637,15 +3742,6 @@ "dup.groupby('dedup_id').ngroups" ] }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "dup_within.to_csv('../data/processed/dup_within.csv')" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -3655,7 +3751,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -4188,8 +4284,8 @@ " TRUE\n", " TRUE\n", " FALSE\n", - " [University of Sumatera Utara, USU Library]\n", - " [http://www.usu.ac.id, http://library.usu.ac.id]\n", + " [USU Library, University of Sumatera Utara]\n", + " [http://library.usu.ac.id, http://www.usu.ac.id]\n", " id\n", " Medan\n", " 3.5595\n", @@ -4210,8 +4306,8 @@ " NaN\n", " NaN\n", " NaN\n", - " [roarmap, celestial, opendoar]\n", - " [2101, 1717, 283]\n", + " [opendoar, roarmap, celestial]\n", + " [1717, 2101, 283]\n", " NaN\n", " NaN\n", " NaN\n", @@ -4379,7 +4475,7 @@ " dspace\n", " geoname_2_ZA\n", " other\n", - " [AS, B1, AI]\n", + " [B1, AI, AS]\n", " 2015-02-10 06:35:50\n", " NaN\n", " NaN\n", @@ -4574,7 +4670,7 @@ " 0\n", " 0\n", " 0\n", - " [celestial, opendoar]\n", + " [opendoar, celestial]\n", " [1779, 1627]\n", " NaN\n", " NaN\n", @@ -5205,14 +5301,14 @@ "\n", " roar_open_access roar_mandate roar_organisation_title \\\n", "0 FALSE TRUE Università degli Studi di Milano \n", - "1 TRUE FALSE [University of Sumatera Utara, USU Library] \n", + "1 TRUE FALSE [USU Library, University of Sumatera Utara] \n", "2 TRUE FALSE National Research Foundation of South Africa \n", "3 NaN NaN Swansea Metropolitan University \n", "4 FALSE FALSE NaN \n", "\n", " roar_organisation_home_page roar_location_country \\\n", "0 http://www.unimi.it it \n", - "1 [http://www.usu.ac.id, http://library.usu.ac.id] id \n", + "1 [http://library.usu.ac.id, http://www.usu.ac.id] id \n", "2 http://www.nrf.ac.za/ za \n", "3 http://www.smu.ac.uk/ gb \n", "4 NaN pe \n", @@ -5227,7 +5323,7 @@ " roar_software roar_geoname roar_version roar_subjects roar_date \\\n", "0 dspace geoname_2_IT other NaN 2014-05-04 17:40:53 \n", "1 dspace geoname_2_ID other NaN 2010-01-15 10:09:25 \n", - "2 dspace geoname_2_ZA other [AS, B1, AI] 2015-02-10 06:35:50 \n", + "2 dspace geoname_2_ZA other [B1, AI, AS] 2015-02-10 06:35:50 \n", "3 dspace geoname_2_GB other NaN 2008-05-15 11:29:17 \n", "4 dspace geoname_2_PE other NaN 2019-09-02 21:20:31 \n", "\n", @@ -5261,9 +5357,9 @@ "\n", " roar_registry_name roar_registry_id \\\n", "0 celestial 1596 \n", - "1 [roarmap, celestial, opendoar] [2101, 1717, 283] \n", + "1 [opendoar, roarmap, celestial] [1717, 2101, 283] \n", "2 roarmap NaN \n", - "3 [celestial, opendoar] [1779, 1627] \n", + "3 [opendoar, celestial] [1779, 1627] \n", "4 opendoar http://v2.sherpa.ac.uk/id/repository/4422 \n", "\n", " roar_submit_to roar_submitted_to_name roar_submitted_to_done \\\n", @@ -5295,7 +5391,7 @@ "4 NaN NaN roar_15142 " ] }, - "execution_count": 21, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -5310,7 +5406,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -5865,7 +5961,7 @@ " NaN\n", " NaN\n", " NaN\n", - " [celestial, opendoar]\n", + " [opendoar, celestial]\n", " [1832, 1149]\n", " NaN\n", " NaN\n", @@ -6229,8 +6325,8 @@ " NaN\n", " NaN\n", " NaN\n", - " [celestial, opendoar]\n", - " [2545, 5072]\n", + " [opendoar, celestial]\n", + " [5072, 2545]\n", " NaN\n", " NaN\n", " NaN\n", @@ -6398,7 +6494,7 @@ " opus\n", " geoname_2_DE\n", " other\n", - " [GE, T1, HB]\n", + " [HB, GE, T1]\n", " 2016-04-28 13:58:38\n", " NaN\n", " please delete ID 5891\n", @@ -6411,8 +6507,8 @@ " NaN\n", " NaN\n", " NaN\n", - " [celestial, opendoar]\n", - " [2539, 6112]\n", + " [opendoar, celestial]\n", + " [6112, 2539]\n", " NaN\n", " NaN\n", " NaN\n", @@ -6926,7 +7022,7 @@ "1 other NaN 2005-06-07 12:57:08 NaN \n", "2 NaN NaN NaN NaN \n", "3 other NaN 2012-08-05 15:12:12 NaN \n", - "4 other [GE, T1, HB] 2016-04-28 13:58:38 NaN \n", + "4 other [HB, GE, T1] 2016-04-28 13:58:38 NaN \n", "\n", " roar_suggestions roar_activity_low roar_activity_medium \\\n", "0 NaN NaN NaN \n", @@ -6958,10 +7054,10 @@ "\n", " roar_registry_name roar_registry_id roar_submit_to \\\n", "0 NaN NaN NaN \n", - "1 [celestial, opendoar] [1832, 1149] NaN \n", + "1 [opendoar, celestial] [1832, 1149] NaN \n", "2 NaN NaN NaN \n", - "3 [celestial, opendoar] [2545, 5072] NaN \n", - "4 [celestial, opendoar] [2539, 6112] NaN \n", + "3 [opendoar, celestial] [5072, 2545] NaN \n", + "4 [opendoar, celestial] [6112, 2539] NaN \n", "\n", " roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank \\\n", "0 NaN NaN NaN \n", @@ -6992,7 +7088,7 @@ "4 NaN NaN roar_11212 " ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -7007,7 +7103,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -8668,7 +8764,7 @@ "4 NaN NaN " ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -8683,18 +8779,18 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - ":1: PerformanceWarning:\n", + ":1: PerformanceWarning:\n", "\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "\n", - ":2: PerformanceWarning:\n", + ":2: PerformanceWarning:\n", "\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "\n" @@ -9058,7 +9154,7 @@ " [dspace, dspace]\n", " [geoname_2_IN, geoname_2_IN]\n", " [other, other]\n", - " [[TK, TJ, TN, TH, TP, TD, TA], [T1, TA]]\n", + " [[TN, TA, TK, TH, TP, TD, TJ], [TA, T1]]\n", " [2011-12-15 09:01:35, 2012-01-05 12:09:37]\n", " [nan, nan]\n", " [nan, nan]\n", @@ -9241,7 +9337,7 @@ " [eprints, eprints]\n", " [geoname_2_IN, geoname_2_IN]\n", " [3.3.15 eps, 3.3.15 eps]\n", - " [[RB, RM], [R1, RZ]]\n", + " [[RM, RB], [R1, RZ]]\n", " [2014-03-07 15:07:45, 2014-03-19 07:05:04]\n", " [The National Institute for Research in Tuberc...\n", " [nan, Please include \"Tuberculosis\" as a Speci...\n", @@ -9254,7 +9350,7 @@ " [nan, nan]\n", " [nan, nan]\n", " [nan, nan]\n", - " [[celestial, opendoar], celestial]\n", + " [[opendoar, celestial], celestial]\n", " [[5410, 2725], 5430]\n", " [nan, nan]\n", " [nan, nan]\n", @@ -9437,7 +9533,7 @@ " [nan, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", - " [[celestial, opendoar], opendoar, opendoar]\n", + " [[opendoar, celestial], opendoar, opendoar]\n", " [[1781, 2426], 1781, 1807]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", @@ -9803,8 +9899,8 @@ " [nan, nan]\n", " [nan, nan]\n", " [nan, nan]\n", - " [[roarmap, celestial, opendoar], [celestial, o...\n", - " [[1441, 193, 1456], [1441, 1456]]\n", + " [[opendoar, roarmap, celestial], [opendoar, ce...\n", + " [[193, 1456, 1441], [1456, 1441]]\n", " [nan, nan]\n", " [nan, nan]\n", " [nan, nan]\n", @@ -10343,8 +10439,8 @@ "4 [geoname_2_HK, geoname_2_CN] [other, other] \n", "\n", " roar_subjects \\\n", - "0 [[TK, TJ, TN, TH, TP, TD, TA], [T1, TA]] \n", - "1 [[RB, RM], [R1, RZ]] \n", + "0 [[TN, TA, TK, TH, TP, TD, TJ], [TA, T1]] \n", + "1 [[RM, RB], [R1, RZ]] \n", "2 [nan, nan, nan] \n", "3 [nan, nan] \n", "4 [nan, nan] \n", @@ -10393,17 +10489,17 @@ "\n", " roar_registry_name \\\n", "0 [celestial, celestial] \n", - "1 [[celestial, opendoar], celestial] \n", - "2 [[celestial, opendoar], opendoar, opendoar] \n", + "1 [[opendoar, celestial], celestial] \n", + "2 [[opendoar, celestial], opendoar, opendoar] \n", "3 [celestial, celestial] \n", - "4 [[roarmap, celestial, opendoar], [celestial, o... \n", + "4 [[opendoar, roarmap, celestial], [opendoar, ce... \n", "\n", " roar_registry_id roar_submit_to roar_submitted_to_name \\\n", "0 [4790, 4789] [nan, nan] [nan, nan] \n", "1 [[5410, 2725], 5430] [nan, nan] [nan, nan] \n", "2 [[1781, 2426], 1781, 1807] [nan, nan, nan] [nan, nan, nan] \n", "3 [4715, 4715] [nan, nan] [nan, nan] \n", - "4 [[1441, 193, 1456], [1441, 1456]] [nan, nan] [nan, nan] \n", + "4 [[193, 1456, 1441], [1456, 1441]] [nan, nan] [nan, nan] \n", "\n", " roar_submitted_to_done roar_webometrics_rank roar_webometrics_size \\\n", "0 [nan, nan] [nan, nan] [nan, nan] \n", @@ -10434,7 +10530,7 @@ "4 [nan, nan] [roar_1019, roar_5550] {roar} " ] }, - "execution_count": 24, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -10447,18 +10543,18 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - ":1: PerformanceWarning:\n", + ":1: PerformanceWarning:\n", "\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "\n", - ":2: PerformanceWarning:\n", + ":2: PerformanceWarning:\n", "\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "\n" @@ -10822,7 +10918,7 @@ " [eprints, nan, nan]\n", " [geoname_2_MX, nan, nan]\n", " [3.3.15 eps, nan, nan]\n", - " [[H1, HX, GF, HC, HD, HT, HB, HM, G1, JA, T1, ...\n", + " [[HB, HD, HT, GF, HC, HF, HJ, HN, HX, H1, HG, ...\n", " [2012-02-03 05:18:16, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", @@ -10835,7 +10931,7 @@ " [nan, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", - " [[celestial, opendoar], nan, nan]\n", + " [[opendoar, celestial], nan, nan]\n", " [[4818, 2429], nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", @@ -11018,8 +11114,8 @@ " [nan, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", - " [nan, nan, [celestial, opendoar]]\n", - " [nan, nan, [5621, 3087]]\n", + " [nan, nan, [opendoar, celestial]]\n", + " [nan, nan, [3087, 5621]]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", @@ -11201,8 +11297,8 @@ " [nan, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", - " [nan, [celestial, opendoar], [celestial, opend...\n", - " [nan, [4672, 2318], [4672, 2318]]\n", + " [nan, [opendoar, celestial], [opendoar, celest...\n", + " [nan, [2318, 4672], [2318, 4672]]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", @@ -11384,7 +11480,7 @@ " [nan, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", - " [[celestial, opendoar], [celestial, opendoar],...\n", + " [[opendoar, celestial], [opendoar, celestial],...\n", " [[1509, 1430], [1509, 1430], nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", @@ -11567,8 +11663,8 @@ " [nan, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", - " [celestial, [celestial, opendoar], nan]\n", - " [4668, [2306, 4668], nan]\n", + " [celestial, [opendoar, celestial], nan]\n", + " [4668, [4668, 2306], nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", " [nan, nan, nan]\n", @@ -12156,7 +12252,7 @@ "4 [geoname_2_CN, geoname_2_CN, nan] [other, other, nan] \n", "\n", " roar_subjects \\\n", - "0 [[H1, HX, GF, HC, HD, HT, HB, HM, G1, JA, T1, ... \n", + "0 [[HB, HD, HT, GF, HC, HF, HJ, HN, HX, H1, HG, ... \n", "1 [nan, nan, nan] \n", "2 [nan, nan, nan] \n", "3 [nan, nan, nan] \n", @@ -12191,18 +12287,18 @@ "4 [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] \n", "\n", " roar_fulltexts_rdocs roar_registry_name \\\n", - "0 [nan, nan, nan] [[celestial, opendoar], nan, nan] \n", - "1 [nan, nan, nan] [nan, nan, [celestial, opendoar]] \n", - "2 [nan, nan, nan] [nan, [celestial, opendoar], [celestial, opend... \n", - "3 [nan, nan, nan] [[celestial, opendoar], [celestial, opendoar],... \n", - "4 [nan, nan, nan] [celestial, [celestial, opendoar], nan] \n", + "0 [nan, nan, nan] [[opendoar, celestial], nan, nan] \n", + "1 [nan, nan, nan] [nan, nan, [opendoar, celestial]] \n", + "2 [nan, nan, nan] [nan, [opendoar, celestial], [opendoar, celest... \n", + "3 [nan, nan, nan] [[opendoar, celestial], [opendoar, celestial],... \n", + "4 [nan, nan, nan] [celestial, [opendoar, celestial], nan] \n", "\n", " roar_registry_id roar_submit_to roar_submitted_to_name \\\n", "0 [[4818, 2429], nan, nan] [nan, nan, nan] [nan, nan, nan] \n", - "1 [nan, nan, [5621, 3087]] [nan, nan, nan] [nan, nan, nan] \n", - "2 [nan, [4672, 2318], [4672, 2318]] [nan, nan, nan] [nan, nan, nan] \n", + "1 [nan, nan, [3087, 5621]] [nan, nan, nan] [nan, nan, nan] \n", + "2 [nan, [2318, 4672], [2318, 4672]] [nan, nan, nan] [nan, nan, nan] \n", "3 [[1509, 1430], [1509, 1430], nan] [nan, nan, nan] [nan, nan, nan] \n", - "4 [4668, [2306, 4668], nan] [nan, nan, nan] [nan, nan, nan] \n", + "4 [4668, [4668, 2306], nan] [nan, nan, nan] [nan, nan, nan] \n", "\n", " roar_submitted_to_done roar_webometrics_rank roar_webometrics_size \\\n", "0 [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] \n", @@ -12233,7 +12329,7 @@ "4 [nan, nan, nan] [roar_4379, roar_4266, nan] {OpenDOAR, roar} " ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -12246,18 +12342,18 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - ":1: PerformanceWarning:\n", + ":1: PerformanceWarning:\n", "\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "\n", - ":2: PerformanceWarning:\n", + ":2: PerformanceWarning:\n", "\n", "DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", "\n" @@ -12648,7 +12744,7 @@ " [nan, nan]\n", " [nan, nan]\n", " [nan, nan]\n", - " {re3data, FAIRsharing}\n", + " {FAIRsharing, re3data}\n", " \n", " \n", " 1\n", @@ -12804,7 +12900,7 @@ " [nan, dspace]\n", " [nan, geoname_2_UA]\n", " [nan, other]\n", - " [nan, [H1, D1, D204, BL, DK, D901, AC, BF, L1,...\n", + " [nan, [B1, BS, BL, AC, D204, DK, HM, BF, L1, D...\n", " [nan, 2015-07-07 12:38:37]\n", " [nan, nan]\n", " [nan, nan]\n", @@ -12817,8 +12913,8 @@ " [nan, nan]\n", " [nan, nan]\n", " [nan, nan]\n", - " [nan, [celestial, opendoar]]\n", - " [nan, [3410, 5883]]\n", + " [nan, [opendoar, celestial]]\n", + " [nan, [5883, 3410]]\n", " [nan, nan]\n", " [nan, nan]\n", " [nan, nan]\n", @@ -13014,7 +13110,7 @@ " [nan, nan]\n", " [nan, nan]\n", " [nan, nan]\n", - " {re3data, FAIRsharing}\n", + " {FAIRsharing, re3data}\n", " \n", " \n", " 3\n", @@ -13366,7 +13462,7 @@ " [nan, nan]\n", " [nan, nan]\n", " [nan, nan]\n", - " [[celestial, opendoar], nan]\n", + " [[opendoar, celestial], nan]\n", " [[1426, 1294], nan]\n", " [nan, nan]\n", " [nan, nan]\n", @@ -13991,7 +14087,7 @@ "\n", " roar_subjects \\\n", "0 [nan, nan] \n", - "1 [nan, [H1, D1, D204, BL, DK, D901, AC, BF, L1,... \n", + "1 [nan, [B1, BS, BL, AC, D204, DK, HM, BF, L1, D... \n", "2 [nan, nan] \n", "3 [nan, nan] \n", "4 [nan, nan] \n", @@ -14033,10 +14129,10 @@ "\n", " roar_registry_name roar_registry_id \\\n", "0 [nan, nan] [nan, nan] \n", - "1 [nan, [celestial, opendoar]] [nan, [3410, 5883]] \n", + "1 [nan, [opendoar, celestial]] [nan, [5883, 3410]] \n", "2 [nan, nan] [nan, nan] \n", "3 [roarmap, nan] [http://roarmap.eprints.org/1046/, nan] \n", - "4 [[celestial, opendoar], nan] [[1426, 1294], nan] \n", + "4 [[opendoar, celestial], nan] [[1426, 1294], nan] \n", "\n", " roar_submit_to roar_submitted_to_name roar_submitted_to_done \\\n", "0 [nan, nan] [nan, nan] [nan, nan] \n", @@ -14067,14 +14163,14 @@ "4 [nan, nan] [nan, nan] [roar_610, nan] \n", "\n", " source_set \n", - "0 {re3data, FAIRsharing} \n", + "0 {FAIRsharing, re3data} \n", "1 {OpenDOAR, roar} \n", - "2 {re3data, FAIRsharing} \n", + "2 {FAIRsharing, re3data} \n", "3 {OpenDOAR, roar} \n", "4 {OpenDOAR, roar} " ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -14087,7 +14183,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [