new dedup file

This commit is contained in:
Andrea Mannocci 2021-09-22 11:59:30 +02:00
parent 7ab83cbb10
commit 6abcd9b142
5 changed files with 4786 additions and 30066 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -26,7 +26,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 19,
"metadata": {},
"outputs": [
{
@ -59,84 +59,36 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>dedup::29a83a8a9641bb860a679d7e5ba52d26</td>\n",
" <td>14174</td>\n",
" <td>14174</td>\n",
" <td>OHIO Open Library | Ohio University Research</td>\n",
" <th>count</th>\n",
" <td>4513</td>\n",
" <td>4513</td>\n",
" <td>4513</td>\n",
" <td>4513</td>\n",
" <td>4513</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>2145</td>\n",
" <td>4513</td>\n",
" <td>4261</td>\n",
" <td>3894</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>dedup::75e33da9b103b7b91dcd8da0abe1354b</td>\n",
" <td>https://fairsharing.org/bsg-d001520</td>\n",
" <td>3860</td>\n",
" <td>UPN JATIM REPOSITORY</td>\n",
" <td>roar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>dedup::000871c1fc726f0b52dc86a4eeb027de</td>\n",
" <td>4612</td>\n",
" <td>4612</td>\n",
" <td>IIT Bombay Institutional Repository</td>\n",
" <td>roar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>dedup::000871c1fc726f0b52dc86a4eeb027de</td>\n",
" <td>4649</td>\n",
" <td>4649</td>\n",
" <td>IIT Bombay Institutional Repository</td>\n",
" <td>roar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>dedup::001e6d882e54c780ce269d3c46997287</td>\n",
" <td>re3data_____::4af9fe2bb93511a5e0f0c39e94d6557f</td>\n",
" <td>r3d100011306</td>\n",
" <td>RESID Database of Protein Modifications</td>\n",
" <td>re3data</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>dedup::001e6d882e54c780ce269d3c46997287</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.q...</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.q...</td>\n",
" <td>RESID Database of Protein Modifications</td>\n",
" <td>FAIRsharing</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>dedup::0023a1e3447fdb31836536cc903f1310</td>\n",
" <td>opendoar____::c6f798b844366ccd65d99bc7f31e0e02</td>\n",
" <td>3410</td>\n",
" <td>erucu: electronic repository of the ukrainian ...</td>\n",
" <td>OpenDOAR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>dedup::0023a1e3447fdb31836536cc903f1310</td>\n",
" <td>10013</td>\n",
" <td>10013</td>\n",
" <td>ErUCU: Electronic repository of the Ukrainian ...</td>\n",
" <td>roar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>dedup::003ab6b40af9b488decea7c582d150a2</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.d...</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.d...</td>\n",
" <td>Synapse</td>\n",
" <td>FAIRsharing</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>dedup::003ab6b40af9b488decea7c582d150a2</td>\n",
" <td>re3data_____::cafc5d99b7c187e24b40d958a16a91f1</td>\n",
" <td>r3d100011894</td>\n",
" <td>Synapse</td>\n",
" <td>re3data</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>dedup::0064f599ed0adb5870a5b3ffe438e485</td>\n",
" <td>16034</td>\n",
" <td>16034</td>\n",
" <td>Giresun University Institutional Repository</td>\n",
" <td>roar</td>\n",
" <th>freq</th>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>1933</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@ -144,210 +96,37 @@
],
"text/plain": [
" dedup_id \\\n",
"0 dedup::29a83a8a9641bb860a679d7e5ba52d26 \n",
"1 dedup::000871c1fc726f0b52dc86a4eeb027de \n",
"2 dedup::000871c1fc726f0b52dc86a4eeb027de \n",
"3 dedup::001e6d882e54c780ce269d3c46997287 \n",
"4 dedup::001e6d882e54c780ce269d3c46997287 \n",
"5 dedup::0023a1e3447fdb31836536cc903f1310 \n",
"6 dedup::0023a1e3447fdb31836536cc903f1310 \n",
"7 dedup::003ab6b40af9b488decea7c582d150a2 \n",
"8 dedup::003ab6b40af9b488decea7c582d150a2 \n",
"9 dedup::0064f599ed0adb5870a5b3ffe438e485 \n",
"count 4513 \n",
"unique 2145 \n",
"top dedup::75e33da9b103b7b91dcd8da0abe1354b \n",
"freq 5 \n",
"\n",
" duplicate_id \\\n",
"0 14174 \n",
"1 4612 \n",
"2 4649 \n",
"3 re3data_____::4af9fe2bb93511a5e0f0c39e94d6557f \n",
"4 https://fairsharing.org/10.25504/FAIRsharing.q... \n",
"5 opendoar____::c6f798b844366ccd65d99bc7f31e0e02 \n",
"6 10013 \n",
"7 https://fairsharing.org/10.25504/FAIRsharing.d... \n",
"8 re3data_____::cafc5d99b7c187e24b40d958a16a91f1 \n",
"9 16034 \n",
" duplicate_id original_id name \\\n",
"count 4513 4513 4513 \n",
"unique 4513 4261 3894 \n",
"top https://fairsharing.org/bsg-d001520 3860 UPN JATIM REPOSITORY \n",
"freq 1 2 4 \n",
"\n",
" original_id \\\n",
"0 14174 \n",
"1 4612 \n",
"2 4649 \n",
"3 r3d100011306 \n",
"4 https://fairsharing.org/10.25504/FAIRsharing.q... \n",
"5 3410 \n",
"6 10013 \n",
"7 https://fairsharing.org/10.25504/FAIRsharing.d... \n",
"8 r3d100011894 \n",
"9 16034 \n",
"\n",
" name source \n",
"0 OHIO Open Library | Ohio University Research roar \n",
"1 IIT Bombay Institutional Repository roar \n",
"2 IIT Bombay Institutional Repository roar \n",
"3 RESID Database of Protein Modifications re3data \n",
"4 RESID Database of Protein Modifications FAIRsharing \n",
"5 erucu: electronic repository of the ukrainian ... OpenDOAR \n",
"6 ErUCU: Electronic repository of the Ukrainian ... roar \n",
"7 Synapse FAIRsharing \n",
"8 Synapse re3data \n",
"9 Giresun University Institutional Repository roar "
" source \n",
"count 4513 \n",
"unique 4 \n",
"top roar \n",
"freq 1933 "
]
},
"execution_count": 46,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_09 = pd.read_csv('../data/interim/ds_dedup09.csv', sep=';')\n",
"df_09.head(10)"
"df_09 = pd.read_csv('../data/interim/ds_dedup09.csv', sep=';', quotechar='\"', header=None, names=['dedup_id', 'duplicate_id', 'original_id', 'name', 'source'])\n",
"df_09.describe()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>dedup_id</th>\n",
" <th>duplicate_id</th>\n",
" <th>original_id</th>\n",
" <th>name</th>\n",
" <th>source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5115</th>\n",
" <td>dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98</td>\n",
" <td>1126</td>\n",
" <td>1126</td>\n",
" <td>RIT Digital Media Library</td>\n",
" <td>roar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5116</th>\n",
" <td>dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98</td>\n",
" <td>opendoar____::443cb001c138b2561a0d90720d6ce111</td>\n",
" <td>648</td>\n",
" <td>rit digital media library</td>\n",
" <td>OpenDOAR</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" dedup_id \\\n",
"5115 dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98 \n",
"5116 dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98 \n",
"\n",
" duplicate_id original_id \\\n",
"5115 1126 1126 \n",
"5116 opendoar____::443cb001c138b2561a0d90720d6ce111 648 \n",
"\n",
" name source \n",
"5115 RIT Digital Media Library roar \n",
"5116 rit digital media library OpenDOAR "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_09[df_09.dedup_id == 'dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>dedup_id</th>\n",
" <th>duplicate_id</th>\n",
" <th>original_id</th>\n",
" <th>name</th>\n",
" <th>source</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>dedup::01846ae470651e97d2f73fce979406a9</td>\n",
" <td>opendoar____::b4d6f2b565ca0eef1f9245403aac366a</td>\n",
" <td>7668</td>\n",
" <td>digital commons at michigan state university c...</td>\n",
" <td>OpenDOAR</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" dedup_id \\\n",
"40 dedup::01846ae470651e97d2f73fce979406a9 \n",
"\n",
" duplicate_id original_id \\\n",
"40 opendoar____::b4d6f2b565ca0eef1f9245403aac366a 7668 \n",
"\n",
" name source \n",
"40 digital commons at michigan state university c... OpenDOAR "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_09[df_09.dedup_id == 'dedup::01846ae470651e97d2f73fce979406a9']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@ -365,7 +144,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@ -380,7 +159,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@ -389,7 +168,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@ -400,7 +179,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@ -434,43 +213,43 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>dedup::29a83a8a9641bb860a679d7e5ba52d26</td>\n",
" <td>14174</td>\n",
" <td>14174</td>\n",
" <td>OHIO Open Library | Ohio University Research</td>\n",
" <td>roar</td>\n",
" <td>dedup::252773ebafcbbac75238b419d964068e</td>\n",
" <td>https://fairsharing.org/bsg-d001520</td>\n",
" <td>https://fairsharing.org/bsg-d001520</td>\n",
" <td>ACTRIS Data Centre</td>\n",
" <td>FAIRsharing</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>dedup::000871c1fc726f0b52dc86a4eeb027de</td>\n",
" <td>4612</td>\n",
" <td>4612</td>\n",
" <td>IIT Bombay Institutional Repository</td>\n",
" <td>roar</td>\n",
" <td>dedup::860320be12a1c050cd7731794e231bd3</td>\n",
" <td>opendoar____::2290a7385ed77cc5592dc2153229f082</td>\n",
" <td>1064</td>\n",
" <td>oxford university research archive</td>\n",
" <td>OpenDOAR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>dedup::000871c1fc726f0b52dc86a4eeb027de</td>\n",
" <td>4649</td>\n",
" <td>4649</td>\n",
" <td>IIT Bombay Institutional Repository</td>\n",
" <td>roar</td>\n",
" <td>dedup::1aa7a8773e6a7fdacbcedf9999009a38</td>\n",
" <td>opendoar____::191f8f858acda435ae0daf994e2a72c2</td>\n",
" <td>8648</td>\n",
" <td>digital commons@georgia southern</td>\n",
" <td>OpenDOAR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>dedup::001e6d882e54c780ce269d3c46997287</td>\n",
" <td>re3data_____::4af9fe2bb93511a5e0f0c39e94d6557f</td>\n",
" <td>r3d100011306</td>\n",
" <td>RESID Database of Protein Modifications</td>\n",
" <td>re3data</td>\n",
" <td>dedup::4801441f041958afaca324c43c40787b</td>\n",
" <td>10453</td>\n",
" <td>10453</td>\n",
" <td>MCStor</td>\n",
" <td>roar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>dedup::001e6d882e54c780ce269d3c46997287</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.q...</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.q...</td>\n",
" <td>RESID Database of Protein Modifications</td>\n",
" <td>FAIRsharing</td>\n",
" <td>dedup::2841194266115ac1cc04d19630cde46b</td>\n",
" <td>re3data_____::3afbb2b45a3dd218a5a091ca773cf6c5</td>\n",
" <td>r3d100011189</td>\n",
" <td>PRISM: University of Calgary's Digital Repository</td>\n",
" <td>re3data</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@ -478,35 +257,35 @@
],
"text/plain": [
" dedup_id \\\n",
"0 dedup::29a83a8a9641bb860a679d7e5ba52d26 \n",
"1 dedup::000871c1fc726f0b52dc86a4eeb027de \n",
"2 dedup::000871c1fc726f0b52dc86a4eeb027de \n",
"3 dedup::001e6d882e54c780ce269d3c46997287 \n",
"4 dedup::001e6d882e54c780ce269d3c46997287 \n",
"0 dedup::252773ebafcbbac75238b419d964068e \n",
"1 dedup::860320be12a1c050cd7731794e231bd3 \n",
"2 dedup::1aa7a8773e6a7fdacbcedf9999009a38 \n",
"3 dedup::4801441f041958afaca324c43c40787b \n",
"4 dedup::2841194266115ac1cc04d19630cde46b \n",
"\n",
" duplicate_id \\\n",
"0 14174 \n",
"1 4612 \n",
"2 4649 \n",
"3 re3data_____::4af9fe2bb93511a5e0f0c39e94d6557f \n",
"4 https://fairsharing.org/10.25504/FAIRsharing.q... \n",
"0 https://fairsharing.org/bsg-d001520 \n",
"1 opendoar____::2290a7385ed77cc5592dc2153229f082 \n",
"2 opendoar____::191f8f858acda435ae0daf994e2a72c2 \n",
"3 10453 \n",
"4 re3data_____::3afbb2b45a3dd218a5a091ca773cf6c5 \n",
"\n",
" original_id \\\n",
"0 14174 \n",
"1 4612 \n",
"2 4649 \n",
"3 r3d100011306 \n",
"4 https://fairsharing.org/10.25504/FAIRsharing.q... \n",
"0 https://fairsharing.org/bsg-d001520 \n",
"1 1064 \n",
"2 8648 \n",
"3 10453 \n",
"4 r3d100011189 \n",
"\n",
" name source \n",
"0 OHIO Open Library | Ohio University Research roar \n",
"1 IIT Bombay Institutional Repository roar \n",
"2 IIT Bombay Institutional Repository roar \n",
"3 RESID Database of Protein Modifications re3data \n",
"4 RESID Database of Protein Modifications FAIRsharing "
"0 ACTRIS Data Centre FAIRsharing \n",
"1 oxford university research archive OpenDOAR \n",
"2 digital commons@georgia southern OpenDOAR \n",
"3 MCStor roar \n",
"4 PRISM: University of Calgary's Digital Repository re3data "
]
},
"execution_count": 7,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@ -517,7 +296,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 44,
"metadata": {},
"outputs": [
{
@ -545,6 +324,7 @@
" <th>original_id</th>\n",
" <th>name</th>\n",
" <th>source</th>\n",
" <th>source_set</th>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup_id</th>\n",
@ -552,6 +332,7 @@
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
@ -561,6 +342,7 @@
" <td>[4612, 4649]</td>\n",
" <td>[IIT Bombay Institutional Repository, IIT Bomb...</td>\n",
" <td>[roar, roar]</td>\n",
" <td>{roar}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::001e6d882e54c780ce269d3c46997287</th>\n",
@ -568,6 +350,7 @@
" <td>[r3d100011306, https://fairsharing.org/10.2550...</td>\n",
" <td>[RESID Database of Protein Modifications, RESI...</td>\n",
" <td>[re3data, FAIRsharing]</td>\n",
" <td>{re3data, FAIRsharing}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::0023a1e3447fdb31836536cc903f1310</th>\n",
@ -575,6 +358,7 @@
" <td>[3410, 10013]</td>\n",
" <td>[erucu: electronic repository of the ukrainian...</td>\n",
" <td>[OpenDOAR, roar]</td>\n",
" <td>{OpenDOAR, roar}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::003ab6b40af9b488decea7c582d150a2</th>\n",
@ -582,6 +366,7 @@
" <td>[https://fairsharing.org/10.25504/FAIRsharing....</td>\n",
" <td>[Synapse, Synapse]</td>\n",
" <td>[FAIRsharing, re3data]</td>\n",
" <td>{re3data, FAIRsharing}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::0064f599ed0adb5870a5b3ffe438e485</th>\n",
@ -589,6 +374,7 @@
" <td>[16034, 9647]</td>\n",
" <td>[Giresun University Institutional Repository, ...</td>\n",
" <td>[roar, OpenDOAR]</td>\n",
" <td>{OpenDOAR, roar}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
@ -596,6 +382,7 @@
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::ff49cc40a8890e6a60f40ff3026d2730</th>\n",
@ -603,6 +390,7 @@
" <td>[1333, 1389]</td>\n",
" <td>[UnissResearch, unissresearch]</td>\n",
" <td>[roar, OpenDOAR]</td>\n",
" <td>{OpenDOAR, roar}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::ff4d70de478038c72282b7e4af1d4260</th>\n",
@ -610,6 +398,7 @@
" <td>[9752, 16367]</td>\n",
" <td>[european xfel publication database, European ...</td>\n",
" <td>[OpenDOAR, roar]</td>\n",
" <td>{OpenDOAR, roar}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::ff826ce6ee85809389f18a5fafe72366</th>\n",
@ -617,6 +406,7 @@
" <td>[3601, 2608]</td>\n",
" <td>[electronic odessa national economic universit...</td>\n",
" <td>[OpenDOAR, OpenDOAR]</td>\n",
" <td>{OpenDOAR}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::ffbd6cbb019a1413183c8d08f2929307</th>\n",
@ -624,6 +414,7 @@
" <td>[3108, 1912]</td>\n",
" <td>[Fotografía Sobre España en el Siglo XIX, foto...</td>\n",
" <td>[roar, OpenDOAR]</td>\n",
" <td>{OpenDOAR, roar}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98</th>\n",
@ -631,10 +422,11 @@
" <td>[1126, 648]</td>\n",
" <td>[RIT Digital Media Library, rit digital media ...</td>\n",
" <td>[roar, OpenDOAR]</td>\n",
" <td>{OpenDOAR, roar}</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2453 rows × 4 columns</p>\n",
"<p>2145 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
@ -680,7 +472,7 @@
"dedup::ffbd6cbb019a1413183c8d08f2929307 [Fotografía Sobre España en el Siglo XIX, foto... \n",
"dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98 [RIT Digital Media Library, rit digital media ... \n",
"\n",
" source \n",
" source \\\n",
"dedup_id \n",
"dedup::000871c1fc726f0b52dc86a4eeb027de [roar, roar] \n",
"dedup::001e6d882e54c780ce269d3c46997287 [re3data, FAIRsharing] \n",
@ -694,22 +486,37 @@
"dedup::ffbd6cbb019a1413183c8d08f2929307 [roar, OpenDOAR] \n",
"dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98 [roar, OpenDOAR] \n",
"\n",
"[2453 rows x 4 columns]"
" source_set \n",
"dedup_id \n",
"dedup::000871c1fc726f0b52dc86a4eeb027de {roar} \n",
"dedup::001e6d882e54c780ce269d3c46997287 {re3data, FAIRsharing} \n",
"dedup::0023a1e3447fdb31836536cc903f1310 {OpenDOAR, roar} \n",
"dedup::003ab6b40af9b488decea7c582d150a2 {re3data, FAIRsharing} \n",
"dedup::0064f599ed0adb5870a5b3ffe438e485 {OpenDOAR, roar} \n",
"... ... \n",
"dedup::ff49cc40a8890e6a60f40ff3026d2730 {OpenDOAR, roar} \n",
"dedup::ff4d70de478038c72282b7e4af1d4260 {OpenDOAR, roar} \n",
"dedup::ff826ce6ee85809389f18a5fafe72366 {OpenDOAR} \n",
"dedup::ffbd6cbb019a1413183c8d08f2929307 {OpenDOAR, roar} \n",
"dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98 {OpenDOAR, roar} \n",
"\n",
"[2145 rows x 5 columns]"
]
},
"execution_count": 8,
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup = df_09.groupby('dedup_id').aggregate(list)\n",
"dup['source_set'] = dup.source.map(set)\n",
"dup"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 45,
"metadata": {},
"outputs": [
{
@ -737,6 +544,7 @@
" <th>original_id</th>\n",
" <th>name</th>\n",
" <th>source</th>\n",
" <th>source_set</th>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup_id</th>\n",
@ -744,43 +552,49 @@
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>dedup::01846ae470651e97d2f73fce979406a9</th>\n",
" <td>[opendoar____::b4d6f2b565ca0eef1f9245403aac366a]</td>\n",
" <td>[7668]</td>\n",
" <td>[digital commons at michigan state university ...</td>\n",
" <td>[OpenDOAR]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::022036087426786cfd0f7f41fa7a2665</th>\n",
" <th>dedup::06138bc5af6023646ede0e1f7c1eac75</th>\n",
" <td>[https://fairsharing.org/10.25504/FAIRsharing....</td>\n",
" <td>[https://fairsharing.org/10.25504/FAIRsharing....</td>\n",
" <td>[World Data Center for Climate at DRKZ]</td>\n",
" <td>[FAIRsharing]</td>\n",
" <td>[Crystallography Open Database, Crystallograph...</td>\n",
" <td>[FAIRsharing, roar, re3data, OpenDOAR]</td>\n",
" <td>{re3data, OpenDOAR, roar, FAIRsharing}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::07e8b472e1e4af17a6b20ce083baf29f</th>\n",
" <td>[15036]</td>\n",
" <td>[15036]</td>\n",
" <td>[MiCISAN]</td>\n",
" <td>[roar]</td>\n",
" <th>dedup::0b7e684c89e746c67c9761ce2b65479c</th>\n",
" <td>[re3data_____::44217da669f17a260c0958a679003a7...</td>\n",
" <td>[r3d100010423, 375, https://fairsharing.org/10...</td>\n",
" <td>[Woods Hole Open Access Server, woods hole ope...</td>\n",
" <td>[re3data, OpenDOAR, FAIRsharing]</td>\n",
" <td>{re3data, FAIRsharing, OpenDOAR}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::0894634a3244e3050d8057a453e17e57</th>\n",
" <td>[https://fairsharing.org/10.25504/FAIRsharing....</td>\n",
" <td>[https://fairsharing.org/10.25504/FAIRsharing....</td>\n",
" <td>[European Variation Archive]</td>\n",
" <td>[FAIRsharing]</td>\n",
" <th>dedup::0bb4aec1710521c12ee76289d9440817</th>\n",
" <td>[re3data_____::eb721a14697a05c477d0ae23830e665...</td>\n",
" <td>[r3d100012596, 650, 349]</td>\n",
" <td>[Digitale Bibliothek Thüringen, digitale bibli...</td>\n",
" <td>[re3data, OpenDOAR, roar]</td>\n",
" <td>{re3data, roar, OpenDOAR}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::0a54b19a13b6712dc04d1b49215423d8</th>\n",
" <td>[opendoar____::d34ab169b70c9dcd35e62896010cd9ff]</td>\n",
" <td>[377]</td>\n",
" <td>[yale medicine thesis digital library]</td>\n",
" <td>[OpenDOAR]</td>\n",
" <th>dedup::139042a4157a773f209847829d80894d</th>\n",
" <td>[756, re3data_____::a95b34b344dc049963c35997fe...</td>\n",
" <td>[756, r3d100010690, 1330, 5487]</td>\n",
" <td>[Khazar University Institutional Repository, K...</td>\n",
" <td>[roar, re3data, OpenDOAR, roar]</td>\n",
" <td>{re3data, roar, OpenDOAR}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::13d4bfa0321f86f042b34ec79064b316</th>\n",
" <td>[opendoar____::e3844e186e6eb8736e9f53c0c588952...</td>\n",
" <td>[9411, 15255, r3d100013135]</td>\n",
" <td>[fordatis, Fordatis, Fordatis]</td>\n",
" <td>[OpenDOAR, roar, re3data]</td>\n",
" <td>{OpenDOAR, re3data, roar}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
@ -788,143 +602,139 @@
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::f8306c8f16096b6d944799f4d427a976</th>\n",
" <td>[re3data_____::574b553c6c374d597d2068ab2b117889]</td>\n",
" <td>[r3d100012041]</td>\n",
" <td>[Canadian Disaster Database]</td>\n",
" <td>[re3data]</td>\n",
" <th>dedup::eded0708dfe855304a50029fccf1a677</th>\n",
" <td>[opendoar____::eccbc87e4b5ce2fe28308fd9f2a7baf...</td>\n",
" <td>[3, r3d100012604, 5509]</td>\n",
" <td>[ams acta, AMS Acta, AMS Acta]</td>\n",
" <td>[OpenDOAR, re3data, roar]</td>\n",
" <td>{OpenDOAR, roar, re3data}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::f9d8e2daaa9144310b66bf948e50d656</th>\n",
" <td>[re3data_____::95014789f83d7611ebfddace19d0523a]</td>\n",
" <td>[r3d100011045]</td>\n",
" <td>[Index to Marine &amp; Lacustrine Geological Samples]</td>\n",
" <td>[re3data]</td>\n",
" <th>dedup::ef46a43afd7c7d67e21f4306bb1364e9</th>\n",
" <td>[opendoar____::1f74a54f39b3123ad272ca0a06e7463...</td>\n",
" <td>[5870, https://fairsharing.org/10.25504/FAIRsh...</td>\n",
" <td>[heidata, heiDATA, heiDATA]</td>\n",
" <td>[OpenDOAR, FAIRsharing, re3data]</td>\n",
" <td>{OpenDOAR, re3data, FAIRsharing}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::fcdbc4f504a15df8f78da88ee72fad32</th>\n",
" <td>[opendoar____::9f96f36b7aae3b1ff847c26ac94c604e]</td>\n",
" <td>[4979]</td>\n",
" <td>[university of minnesota law school]</td>\n",
" <td>[OpenDOAR]</td>\n",
" <th>dedup::f296bb3903d8a84d81c47e6db90764b9</th>\n",
" <td>[https://fairsharing.org/10.25504/FAIRsharing....</td>\n",
" <td>[https://fairsharing.org/10.25504/FAIRsharing....</td>\n",
" <td>[PubChem, PubChem, pubchem]</td>\n",
" <td>[FAIRsharing, re3data, OpenDOAR]</td>\n",
" <td>{re3data, OpenDOAR, FAIRsharing}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::fcfe9c770eb9372e6961a17f7eaffd5f</th>\n",
" <td>[4637]</td>\n",
" <td>[4637]</td>\n",
" <td>[Simon Fraser University Institutional Reposit...</td>\n",
" <td>[roar]</td>\n",
" <th>dedup::f3dc6512e46961c363ea402ff218c8fb</th>\n",
" <td>[re3data_____::b2fc675049cbf485d9abbccf5232a31...</td>\n",
" <td>[r3d100012538, 10171, https://fairsharing.org/...</td>\n",
" <td>[DataverseNO, dataverseno, DataverseNO]</td>\n",
" <td>[re3data, OpenDOAR, FAIRsharing]</td>\n",
" <td>{re3data, FAIRsharing, OpenDOAR}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>dedup::fe73f687e5bc5280214e0486b273a5f9</th>\n",
" <td>[330]</td>\n",
" <td>[330]</td>\n",
" <td>[DigitalCommons@Fort Lewis College: Scholarshi...</td>\n",
" <td>[roar]</td>\n",
" <th>dedup::f9aa64cbb57131939eda048250f2dbae</th>\n",
" <td>[re3data_____::063765fa6d6358d62ea2d41dde32d3a...</td>\n",
" <td>[r3d100012692, https://fairsharing.org/10.2550...</td>\n",
" <td>[Scholars' Mine, Scholars' Mine, scholars mine]</td>\n",
" <td>[re3data, FAIRsharing, OpenDOAR]</td>\n",
" <td>{re3data, OpenDOAR, FAIRsharing}</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>109 rows × 4 columns</p>\n",
"<p>65 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" duplicate_id \\\n",
"dedup_id \n",
"dedup::01846ae470651e97d2f73fce979406a9 [opendoar____::b4d6f2b565ca0eef1f9245403aac366a] \n",
"dedup::022036087426786cfd0f7f41fa7a2665 [https://fairsharing.org/10.25504/FAIRsharing.... \n",
"dedup::07e8b472e1e4af17a6b20ce083baf29f [15036] \n",
"dedup::0894634a3244e3050d8057a453e17e57 [https://fairsharing.org/10.25504/FAIRsharing.... \n",
"dedup::0a54b19a13b6712dc04d1b49215423d8 [opendoar____::d34ab169b70c9dcd35e62896010cd9ff] \n",
"dedup::06138bc5af6023646ede0e1f7c1eac75 [https://fairsharing.org/10.25504/FAIRsharing.... \n",
"dedup::0b7e684c89e746c67c9761ce2b65479c [re3data_____::44217da669f17a260c0958a679003a7... \n",
"dedup::0bb4aec1710521c12ee76289d9440817 [re3data_____::eb721a14697a05c477d0ae23830e665... \n",
"dedup::139042a4157a773f209847829d80894d [756, re3data_____::a95b34b344dc049963c35997fe... \n",
"dedup::13d4bfa0321f86f042b34ec79064b316 [opendoar____::e3844e186e6eb8736e9f53c0c588952... \n",
"... ... \n",
"dedup::f8306c8f16096b6d944799f4d427a976 [re3data_____::574b553c6c374d597d2068ab2b117889] \n",
"dedup::f9d8e2daaa9144310b66bf948e50d656 [re3data_____::95014789f83d7611ebfddace19d0523a] \n",
"dedup::fcdbc4f504a15df8f78da88ee72fad32 [opendoar____::9f96f36b7aae3b1ff847c26ac94c604e] \n",
"dedup::fcfe9c770eb9372e6961a17f7eaffd5f [4637] \n",
"dedup::fe73f687e5bc5280214e0486b273a5f9 [330] \n",
"dedup::eded0708dfe855304a50029fccf1a677 [opendoar____::eccbc87e4b5ce2fe28308fd9f2a7baf... \n",
"dedup::ef46a43afd7c7d67e21f4306bb1364e9 [opendoar____::1f74a54f39b3123ad272ca0a06e7463... \n",
"dedup::f296bb3903d8a84d81c47e6db90764b9 [https://fairsharing.org/10.25504/FAIRsharing.... \n",
"dedup::f3dc6512e46961c363ea402ff218c8fb [re3data_____::b2fc675049cbf485d9abbccf5232a31... \n",
"dedup::f9aa64cbb57131939eda048250f2dbae [re3data_____::063765fa6d6358d62ea2d41dde32d3a... \n",
"\n",
" original_id \\\n",
"dedup_id \n",
"dedup::01846ae470651e97d2f73fce979406a9 [7668] \n",
"dedup::022036087426786cfd0f7f41fa7a2665 [https://fairsharing.org/10.25504/FAIRsharing.... \n",
"dedup::07e8b472e1e4af17a6b20ce083baf29f [15036] \n",
"dedup::0894634a3244e3050d8057a453e17e57 [https://fairsharing.org/10.25504/FAIRsharing.... \n",
"dedup::0a54b19a13b6712dc04d1b49215423d8 [377] \n",
"dedup::06138bc5af6023646ede0e1f7c1eac75 [https://fairsharing.org/10.25504/FAIRsharing.... \n",
"dedup::0b7e684c89e746c67c9761ce2b65479c [r3d100010423, 375, https://fairsharing.org/10... \n",
"dedup::0bb4aec1710521c12ee76289d9440817 [r3d100012596, 650, 349] \n",
"dedup::139042a4157a773f209847829d80894d [756, r3d100010690, 1330, 5487] \n",
"dedup::13d4bfa0321f86f042b34ec79064b316 [9411, 15255, r3d100013135] \n",
"... ... \n",
"dedup::f8306c8f16096b6d944799f4d427a976 [r3d100012041] \n",
"dedup::f9d8e2daaa9144310b66bf948e50d656 [r3d100011045] \n",
"dedup::fcdbc4f504a15df8f78da88ee72fad32 [4979] \n",
"dedup::fcfe9c770eb9372e6961a17f7eaffd5f [4637] \n",
"dedup::fe73f687e5bc5280214e0486b273a5f9 [330] \n",
"dedup::eded0708dfe855304a50029fccf1a677 [3, r3d100012604, 5509] \n",
"dedup::ef46a43afd7c7d67e21f4306bb1364e9 [5870, https://fairsharing.org/10.25504/FAIRsh... \n",
"dedup::f296bb3903d8a84d81c47e6db90764b9 [https://fairsharing.org/10.25504/FAIRsharing.... \n",
"dedup::f3dc6512e46961c363ea402ff218c8fb [r3d100012538, 10171, https://fairsharing.org/... \n",
"dedup::f9aa64cbb57131939eda048250f2dbae [r3d100012692, https://fairsharing.org/10.2550... \n",
"\n",
" name \\\n",
"dedup_id \n",
"dedup::01846ae470651e97d2f73fce979406a9 [digital commons at michigan state university ... \n",
"dedup::022036087426786cfd0f7f41fa7a2665 [World Data Center for Climate at DRKZ] \n",
"dedup::07e8b472e1e4af17a6b20ce083baf29f [MiCISAN] \n",
"dedup::0894634a3244e3050d8057a453e17e57 [European Variation Archive] \n",
"dedup::0a54b19a13b6712dc04d1b49215423d8 [yale medicine thesis digital library] \n",
"dedup::06138bc5af6023646ede0e1f7c1eac75 [Crystallography Open Database, Crystallograph... \n",
"dedup::0b7e684c89e746c67c9761ce2b65479c [Woods Hole Open Access Server, woods hole ope... \n",
"dedup::0bb4aec1710521c12ee76289d9440817 [Digitale Bibliothek Thüringen, digitale bibli... \n",
"dedup::139042a4157a773f209847829d80894d [Khazar University Institutional Repository, K... \n",
"dedup::13d4bfa0321f86f042b34ec79064b316 [fordatis, Fordatis, Fordatis] \n",
"... ... \n",
"dedup::f8306c8f16096b6d944799f4d427a976 [Canadian Disaster Database] \n",
"dedup::f9d8e2daaa9144310b66bf948e50d656 [Index to Marine & Lacustrine Geological Samples] \n",
"dedup::fcdbc4f504a15df8f78da88ee72fad32 [university of minnesota law school] \n",
"dedup::fcfe9c770eb9372e6961a17f7eaffd5f [Simon Fraser University Institutional Reposit... \n",
"dedup::fe73f687e5bc5280214e0486b273a5f9 [DigitalCommons@Fort Lewis College: Scholarshi... \n",
"dedup::eded0708dfe855304a50029fccf1a677 [ams acta, AMS Acta, AMS Acta] \n",
"dedup::ef46a43afd7c7d67e21f4306bb1364e9 [heidata, heiDATA, heiDATA] \n",
"dedup::f296bb3903d8a84d81c47e6db90764b9 [PubChem, PubChem, pubchem] \n",
"dedup::f3dc6512e46961c363ea402ff218c8fb [DataverseNO, dataverseno, DataverseNO] \n",
"dedup::f9aa64cbb57131939eda048250f2dbae [Scholars' Mine, Scholars' Mine, scholars mine] \n",
"\n",
" source \n",
" source \\\n",
"dedup_id \n",
"dedup::01846ae470651e97d2f73fce979406a9 [OpenDOAR] \n",
"dedup::022036087426786cfd0f7f41fa7a2665 [FAIRsharing] \n",
"dedup::07e8b472e1e4af17a6b20ce083baf29f [roar] \n",
"dedup::0894634a3244e3050d8057a453e17e57 [FAIRsharing] \n",
"dedup::0a54b19a13b6712dc04d1b49215423d8 [OpenDOAR] \n",
"dedup::06138bc5af6023646ede0e1f7c1eac75 [FAIRsharing, roar, re3data, OpenDOAR] \n",
"dedup::0b7e684c89e746c67c9761ce2b65479c [re3data, OpenDOAR, FAIRsharing] \n",
"dedup::0bb4aec1710521c12ee76289d9440817 [re3data, OpenDOAR, roar] \n",
"dedup::139042a4157a773f209847829d80894d [roar, re3data, OpenDOAR, roar] \n",
"dedup::13d4bfa0321f86f042b34ec79064b316 [OpenDOAR, roar, re3data] \n",
"... ... \n",
"dedup::f8306c8f16096b6d944799f4d427a976 [re3data] \n",
"dedup::f9d8e2daaa9144310b66bf948e50d656 [re3data] \n",
"dedup::fcdbc4f504a15df8f78da88ee72fad32 [OpenDOAR] \n",
"dedup::fcfe9c770eb9372e6961a17f7eaffd5f [roar] \n",
"dedup::fe73f687e5bc5280214e0486b273a5f9 [roar] \n",
"dedup::eded0708dfe855304a50029fccf1a677 [OpenDOAR, re3data, roar] \n",
"dedup::ef46a43afd7c7d67e21f4306bb1364e9 [OpenDOAR, FAIRsharing, re3data] \n",
"dedup::f296bb3903d8a84d81c47e6db90764b9 [FAIRsharing, re3data, OpenDOAR] \n",
"dedup::f3dc6512e46961c363ea402ff218c8fb [re3data, OpenDOAR, FAIRsharing] \n",
"dedup::f9aa64cbb57131939eda048250f2dbae [re3data, FAIRsharing, OpenDOAR] \n",
"\n",
"[109 rows x 4 columns]"
" source_set \n",
"dedup_id \n",
"dedup::06138bc5af6023646ede0e1f7c1eac75 {re3data, OpenDOAR, roar, FAIRsharing} \n",
"dedup::0b7e684c89e746c67c9761ce2b65479c {re3data, FAIRsharing, OpenDOAR} \n",
"dedup::0bb4aec1710521c12ee76289d9440817 {re3data, roar, OpenDOAR} \n",
"dedup::139042a4157a773f209847829d80894d {re3data, roar, OpenDOAR} \n",
"dedup::13d4bfa0321f86f042b34ec79064b316 {OpenDOAR, re3data, roar} \n",
"... ... \n",
"dedup::eded0708dfe855304a50029fccf1a677 {OpenDOAR, roar, re3data} \n",
"dedup::ef46a43afd7c7d67e21f4306bb1364e9 {OpenDOAR, re3data, FAIRsharing} \n",
"dedup::f296bb3903d8a84d81c47e6db90764b9 {re3data, OpenDOAR, FAIRsharing} \n",
"dedup::f3dc6512e46961c363ea402ff218c8fb {re3data, FAIRsharing, OpenDOAR} \n",
"dedup::f9aa64cbb57131939eda048250f2dbae {re3data, OpenDOAR, FAIRsharing} \n",
"\n",
"[65 rows x 5 columns]"
]
},
"execution_count": 9,
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup[dup.source.str.len() == 1]"
"dup[dup.source_set.str.len() >= 3]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([list(['r3d100013204', 'r3d100013458', 'r3d100012807', 'r3d100012808', 'r3d100012806', 'r3d100012805']),\n",
" list(['243', '5702', '5715', '5694', '5689', '5658', '5710', '5750', '5721', '5704']),\n",
" list(['2738', '4991', '2727', '2729', '2724', '2728', '2740', '174']),\n",
" list(['19', '8', '7', '11', '10', '13', '6', '12', '20', '15', '9', '5', '14', '16'])],\n",
" dtype=object)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dup[dup.source.str.len() >= 6].original_id.values"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 33,
"metadata": {},
"outputs": [
{
@ -1050,7 +860,7 @@
"5 None "
]
},
"execution_count": 11,
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}