counting duplicates within registries

2021-10-08 10:50:42 +02:00 · 2021-10-08 10:50:42 +02:00 · a55db56e2e
parent 8f3175f792
commit a55db56e2e
2 changed files with 257 additions and 7151 deletions
--- a/notebooks/03-overlap.ipynb
+++ b/notebooks/03-overlap.ipynb
@ -25,6 +25,13 @@
    "pd.set_option('display.max_columns', None)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Loading data from registries"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 2,
@ -2615,6 +2622,13 @@
    "dup.describe()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Assessing duplicates across registries"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 9,
@ -2729,6 +2743,249 @@
    "dup_grouped[dup_grouped.source_set.str.len() == 1].count()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Assessing duplicates within registries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>duplicate_id</th>\n",
+       "      <th>original_id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>source</th>\n",
+       "      <th>unique_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>28</td>\n",
+       "      <td>28</td>\n",
+       "      <td>28</td>\n",
+       "      <td>28</td>\n",
+       "      <td>28</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sum</th>\n",
+       "      <td>58</td>\n",
+       "      <td>58</td>\n",
+       "      <td>58</td>\n",
+       "      <td>58</td>\n",
+       "      <td>58</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       duplicate_id  original_id  name  source  unique_id\n",
+       "count            28           28    28      28         28\n",
+       "sum              58           58    58      58         58"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "opendoar_dup = dup.loc[dup['source'].eq('OpenDOAR')].groupby('dedup_id').count()\n",
+    "opendoar_dup.loc[opendoar_dup['duplicate_id'].gt(1)].agg(['count', 'sum'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>duplicate_id</th>\n",
+       "      <th>original_id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>source</th>\n",
+       "      <th>unique_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sum</th>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       duplicate_id  original_id  name  source  unique_id\n",
+       "count             3            3     3       3          3\n",
+       "sum               6            6     6       6          6"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "re3data_dup = dup.loc[dup['source'].eq('re3data')].groupby('dedup_id').count()\n",
+    "re3data_dup.loc[re3data_dup['duplicate_id'].gt(1)].agg(['count', 'sum'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>duplicate_id</th>\n",
+       "      <th>original_id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>source</th>\n",
+       "      <th>unique_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>249</td>\n",
+       "      <td>249</td>\n",
+       "      <td>249</td>\n",
+       "      <td>249</td>\n",
+       "      <td>249</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sum</th>\n",
+       "      <td>518</td>\n",
+       "      <td>518</td>\n",
+       "      <td>518</td>\n",
+       "      <td>518</td>\n",
+       "      <td>518</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       duplicate_id  original_id  name  source  unique_id\n",
+       "count           249          249   249     249        249\n",
+       "sum             518          518   518     518        518"
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "roar_dup = dup.loc[dup['source'].eq('roar')].groupby('dedup_id').count()\n",
+    "roar_dup.loc[roar_dup['duplicate_id'].gt(1)].agg(['count', 'sum'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "duplicate_id    0\n",
+       "original_id     0\n",
+       "name            0\n",
+       "source          0\n",
+       "unique_id       0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fairsharing_dup = dup.loc[dup['source'].eq('FAIRsharing')].groupby('dedup_id').count()\n",
+    "fairsharing_dup.loc[fairsharing_dup['duplicate_id'].gt(1)].count()"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
--- a/notebooks/[deprecated]-Explorative.ipynb
+++ b/notebooks/[deprecated]-Explorative.ipynb