From 98075dbae9baf731c2c69c30c1a3769dd526c7f1 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Fri, 8 Oct 2021 11:25:50 +0200 Subject: [PATCH] counting duplicates within --- notebooks/03-overlap.ipynb | 111 +++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 18 deletions(-) diff --git a/notebooks/03-overlap.ipynb b/notebooks/03-overlap.ipynb index 1fa3496..a1a13f6 100644 --- a/notebooks/03-overlap.ipynb +++ b/notebooks/03-overlap.ipynb @@ -2791,7 +2791,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -2849,19 +2849,20 @@ "sum 58 58 58 58 58" ] }, - "execution_count": 14, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "opendoar_dup = dup[dup.source == 'OpenDOAR'].groupby('dedup_id').count()\n", - "opendoar_dup[opendoar_dup.duplicate_id > 1].aggregate(['count', 'sum'])" + "opendoar_dup = opendoar_dup[opendoar_dup.duplicate_id > 1]\n", + "opendoar_dup.aggregate(['count', 'sum'])" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -2919,19 +2920,20 @@ "sum 6 6 6 6 6" ] }, - "execution_count": 15, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re3data_dup = dup[dup.source == 're3data'].groupby('dedup_id').count()\n", - "re3data_dup[re3data_dup.duplicate_id > 1].aggregate(['count', 'sum'])" + "re3data_dup = re3data_dup[re3data_dup.duplicate_id > 1]\n", + "re3data_dup.aggregate(['count', 'sum'])" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -2989,40 +2991,113 @@ "sum 518 518 518 518 518" ] }, - "execution_count": 16, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "roar_dup = dup[dup.source == 'roar'].groupby('dedup_id').count()\n", - "roar_dup[roar_dup.duplicate_id > 1].aggregate(['count', 'sum'])" + "roar_dup = roar_dup[roar_dup.duplicate_id > 1]\n", + "roar_dup.aggregate(['count', 'sum'])" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 46, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
duplicate_idoriginal_idnamesourceunique_id
count00000
sum00000
\n", + "
" + ], "text/plain": [ - "duplicate_id 0\n", - "original_id 0\n", - "name 0\n", - "source 0\n", - "unique_id 0\n", - "dtype: int64" + " duplicate_id original_id name source unique_id\n", + "count 0 0 0 0 0\n", + "sum 0 0 0 0 0" ] }, - "execution_count": 17, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fairsharing_dup = dup[dup.source == 'FAIRsharing'].groupby('dedup_id').count()\n", - "fairsharing_dup[fairsharing_dup.duplicate_id > 1].count()" + "fairsharing_dup = fairsharing_dup[fairsharing_dup.duplicate_id > 1]\n", + "fairsharing_dup.aggregate(['count', 'sum'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is one record which is counted twice. All ok: 2 records in roar and 2 in opendoar ['OpenDOAR_5226', 'roar_14929', 'OpenDOAR_3820', 'roar_16263']" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['dedup::5243958a762063341dc82d2bbf0f5f33'], dtype=object)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.intersect1d(roar_dup.index, opendoar_dup.index)" ] }, {