counting duplicates within
This commit is contained in:
parent
8f3175f792
commit
a55db56e2e
|
@ -25,6 +25,13 @@
|
|||
"pd.set_option('display.max_columns', None)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Loading data from registries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
|
@ -2615,6 +2622,13 @@
|
|||
"dup.describe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Assessing duplicates across registries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
|
@ -2729,6 +2743,249 @@
|
|||
"dup_grouped[dup_grouped.source_set.str.len() == 1].count()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Assessing duplicates within registries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 65,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>duplicate_id</th>\n",
|
||||
" <th>original_id</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" <th>source</th>\n",
|
||||
" <th>unique_id</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>28</td>\n",
|
||||
" <td>28</td>\n",
|
||||
" <td>28</td>\n",
|
||||
" <td>28</td>\n",
|
||||
" <td>28</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>sum</th>\n",
|
||||
" <td>58</td>\n",
|
||||
" <td>58</td>\n",
|
||||
" <td>58</td>\n",
|
||||
" <td>58</td>\n",
|
||||
" <td>58</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" duplicate_id original_id name source unique_id\n",
|
||||
"count 28 28 28 28 28\n",
|
||||
"sum 58 58 58 58 58"
|
||||
]
|
||||
},
|
||||
"execution_count": 65,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"opendoar_dup = dup[dup.source == 'OpenDOAR'].groupby('dedup_id').count()\n",
|
||||
"opendoar_dup[opendoar_dup.duplicate_id > 1].aggregate(['count', 'sum'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 64,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>duplicate_id</th>\n",
|
||||
" <th>original_id</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" <th>source</th>\n",
|
||||
" <th>unique_id</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>sum</th>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" duplicate_id original_id name source unique_id\n",
|
||||
"count 3 3 3 3 3\n",
|
||||
"sum 6 6 6 6 6"
|
||||
]
|
||||
},
|
||||
"execution_count": 64,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"re3data_dup = dup[dup.source == 're3data'].groupby('dedup_id').count()\n",
|
||||
"re3data_dup[re3data_dup.duplicate_id > 1].aggregate(['count', 'sum'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 63,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>duplicate_id</th>\n",
|
||||
" <th>original_id</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" <th>source</th>\n",
|
||||
" <th>unique_id</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>249</td>\n",
|
||||
" <td>249</td>\n",
|
||||
" <td>249</td>\n",
|
||||
" <td>249</td>\n",
|
||||
" <td>249</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>sum</th>\n",
|
||||
" <td>518</td>\n",
|
||||
" <td>518</td>\n",
|
||||
" <td>518</td>\n",
|
||||
" <td>518</td>\n",
|
||||
" <td>518</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" duplicate_id original_id name source unique_id\n",
|
||||
"count 249 249 249 249 249\n",
|
||||
"sum 518 518 518 518 518"
|
||||
]
|
||||
},
|
||||
"execution_count": 63,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"roar_dup = dup[dup.source == 'roar'].groupby('dedup_id').count()\n",
|
||||
"roar_dup[roar_dup.duplicate_id > 1].aggregate(['count', 'sum'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"duplicate_id 0\n",
|
||||
"original_id 0\n",
|
||||
"name 0\n",
|
||||
"source 0\n",
|
||||
"unique_id 0\n",
|
||||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"fairsharing_dup = dup[dup.source == 'FAIRsharing'].groupby('dedup_id').count()\n",
|
||||
"fairsharing_dup[fairsharing_dup.duplicate_id > 1].count()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue