counting duplicates within
This commit is contained in:
parent
cc2c004b9e
commit
98075dbae9
|
@ -2791,7 +2791,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 43,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -2849,19 +2849,20 @@
|
||||||
"sum 58 58 58 58 58"
|
"sum 58 58 58 58 58"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 14,
|
"execution_count": 43,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"opendoar_dup = dup[dup.source == 'OpenDOAR'].groupby('dedup_id').count()\n",
|
"opendoar_dup = dup[dup.source == 'OpenDOAR'].groupby('dedup_id').count()\n",
|
||||||
"opendoar_dup[opendoar_dup.duplicate_id > 1].aggregate(['count', 'sum'])"
|
"opendoar_dup = opendoar_dup[opendoar_dup.duplicate_id > 1]\n",
|
||||||
|
"opendoar_dup.aggregate(['count', 'sum'])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": 44,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -2919,19 +2920,20 @@
|
||||||
"sum 6 6 6 6 6"
|
"sum 6 6 6 6 6"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 15,
|
"execution_count": 44,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"re3data_dup = dup[dup.source == 're3data'].groupby('dedup_id').count()\n",
|
"re3data_dup = dup[dup.source == 're3data'].groupby('dedup_id').count()\n",
|
||||||
"re3data_dup[re3data_dup.duplicate_id > 1].aggregate(['count', 'sum'])"
|
"re3data_dup = re3data_dup[re3data_dup.duplicate_id > 1]\n",
|
||||||
|
"re3data_dup.aggregate(['count', 'sum'])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": 45,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -2989,40 +2991,113 @@
|
||||||
"sum 518 518 518 518 518"
|
"sum 518 518 518 518 518"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 16,
|
"execution_count": 45,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"roar_dup = dup[dup.source == 'roar'].groupby('dedup_id').count()\n",
|
"roar_dup = dup[dup.source == 'roar'].groupby('dedup_id').count()\n",
|
||||||
"roar_dup[roar_dup.duplicate_id > 1].aggregate(['count', 'sum'])"
|
"roar_dup = roar_dup[roar_dup.duplicate_id > 1]\n",
|
||||||
|
"roar_dup.aggregate(['count', 'sum'])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 17,
|
"execution_count": 46,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>duplicate_id</th>\n",
|
||||||
|
" <th>original_id</th>\n",
|
||||||
|
" <th>name</th>\n",
|
||||||
|
" <th>source</th>\n",
|
||||||
|
" <th>unique_id</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>count</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>sum</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"duplicate_id 0\n",
|
" duplicate_id original_id name source unique_id\n",
|
||||||
"original_id 0\n",
|
"count 0 0 0 0 0\n",
|
||||||
"name 0\n",
|
"sum 0 0 0 0 0"
|
||||||
"source 0\n",
|
|
||||||
"unique_id 0\n",
|
|
||||||
"dtype: int64"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 17,
|
"execution_count": 46,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"fairsharing_dup = dup[dup.source == 'FAIRsharing'].groupby('dedup_id').count()\n",
|
"fairsharing_dup = dup[dup.source == 'FAIRsharing'].groupby('dedup_id').count()\n",
|
||||||
"fairsharing_dup[fairsharing_dup.duplicate_id > 1].count()"
|
"fairsharing_dup = fairsharing_dup[fairsharing_dup.duplicate_id > 1]\n",
|
||||||
|
"fairsharing_dup.aggregate(['count', 'sum'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"One dedup group is counted twice because it appears in both the roar and the OpenDOAR duplicate counts. This is OK: the group contains 2 records from roar and 2 from OpenDOAR: ['OpenDOAR_5226', 'roar_14929', 'OpenDOAR_3820', 'roar_16263']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 47,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"array(['dedup::5243958a762063341dc82d2bbf0f5f33'], dtype=object)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 47,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"np.intersect1d(roar_dup.index, opendoar_dup.index)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue