In [41]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Lift pandas' column cap so wide frames are rendered with every column visible.
pd.set_option(
    'display.max_columns',
    None,  # None means "no limit"
)

In [46]:
# Load the ';'-separated deduplication table (columns shown in the preview:
# dedup_id, duplicate_id, original_id, name, source).
df_09 = pd.read_csv(
    '../data/interim/ds_dedup09.csv',
    delimiter=';',
)
# Preview the first ten rows.
df_09.head(n=10)

Unnamed: 0,dedup_id,duplicate_id,original_id,name,source
0,dedup::29a83a8a9641bb860a679d7e5ba52d26,14174,14174,OHIO Open Library | Ohio University Research,roar
1,dedup::000871c1fc726f0b52dc86a4eeb027de,4612,4612,IIT Bombay Institutional Repository,roar
2,dedup::000871c1fc726f0b52dc86a4eeb027de,4649,4649,IIT Bombay Institutional Repository,roar
3,dedup::001e6d882e54c780ce269d3c46997287,re3data_____::4af9fe2bb93511a5e0f0c39e94d6557f,r3d100011306,RESID Database of Protein Modifications,re3data
4,dedup::001e6d882e54c780ce269d3c46997287,https://fairsharing.org/10.25504/FAIRsharing.q...,https://fairsharing.org/10.25504/FAIRsharing.q...,RESID Database of Protein Modifications,FAIRsharing
5,dedup::0023a1e3447fdb31836536cc903f1310,opendoar____::c6f798b844366ccd65d99bc7f31e0e02,3410,erucu: electronic repository of the ukrainian ...,OpenDOAR
6,dedup::0023a1e3447fdb31836536cc903f1310,10013,10013,ErUCU: Electronic repository of the Ukrainian ...,roar
7,dedup::003ab6b40af9b488decea7c582d150a2,https://fairsharing.org/10.25504/FAIRsharing.d...,https://fairsharing.org/10.25504/FAIRsharing.d...,Synapse,FAIRsharing
8,dedup::003ab6b40af9b488decea7c582d150a2,re3data_____::cafc5d99b7c187e24b40d958a16a91f1,r3d100011894,Synapse,re3data
9,dedup::0064f599ed0adb5870a5b3ffe438e485,16034,16034,Giresun University Institutional Repository,roar


In [24]:
# Show all rows that were assigned to this one dedup group.
df_09.loc[df_09['dedup_id'] == 'dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98']

Unnamed: 0,dedup_id,duplicate_id,original_id,name,source
5115,dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98,1126,1126,RIT Digital Media Library,roar
5116,dedup::ffeed84c7cb1ae7bf4ec4bd78275bb98,opendoar____::443cb001c138b2561a0d90720d6ce111,648,rit digital media library,OpenDOAR


In [25]:
# Show all rows that were assigned to this one dedup group.
df_09.loc[df_09['dedup_id'].eq('dedup::01846ae470651e97d2f73fce979406a9')]

Unnamed: 0,dedup_id,duplicate_id,original_id,name,source
40,dedup::01846ae470651e97d2f73fce979406a9,opendoar____::b4d6f2b565ca0eef1f9245403aac366a,7668,digital commons at michigan state university c...,OpenDOAR


In [3]:
# Load the tab-separated re3data dump. The listed columns are passed through
# ast.literal_eval so their cell text is parsed into Python objects
# instead of staying plain strings.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    sep='\t',
    converters=dict.fromkeys(
        [
            'subject',
            'keyword',
            'additional_name',
            'repository_id',
            'type',
            'content_type',
            'provider_type',
            'institution',
        ],
        ast.literal_eval,
    ),
)

In [4]:
# Load the tab-separated OpenDOAR dump. The listed columns are passed through
# ast.literal_eval so their cell text is parsed into Python objects
# instead of staying plain strings.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    converters={
        column: ast.literal_eval
        for column in (
            'subject',
            'additional_name',
            'opendoar_id',
            'content_type',
            'institution',
        )
    },
)

In [5]:
# Load the ROAR CSV export with every column read as text, so identifiers
# such as eprintid can be compared against string literals later on.
roar_df = pd.read_csv(
    '../data/raw/export_roar_CSV.csv',
    dtype='str',
)

In [6]:
# Load the '|'-separated FAIRsharing summary. header=0 together with names=
# makes pandas consume the file's own header row and substitute these labels.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    sep='|',
    header=0,
    names=[
        'full_name',
        'short_name',
        'fs_url',
        'url',
        'countries',
        'subjects',
    ],
)

In [7]:
# Quick reminder of the dedup table layout (first five rows).
df_09.head(5)

Unnamed: 0,dedup_id,duplicate_id,original_id,name,source
0,dedup::29a83a8a9641bb860a679d7e5ba52d26,14174,14174,OHIO Open Library | Ohio University Research,roar
1,dedup::000871c1fc726f0b52dc86a4eeb027de,4612,4612,IIT Bombay Institutional Repository,roar
2,dedup::000871c1fc726f0b52dc86a4eeb027de,4649,4649,IIT Bombay Institutional Repository,roar
3,dedup::001e6d882e54c780ce269d3c46997287,re3data_____::4af9fe2bb93511a5e0f0c39e94d6557f,r3d100011306,RESID Database of Protein Modifications,re3data
4,dedup::001e6d882e54c780ce269d3c46997287,https://fairsharing.org/10.25504/FAIRsharing.q...,https://fairsharing.org/10.25504/FAIRsharing.q...,RESID Database of Protein Modifications,FAIRsharing


In [8]:
# Collapse each dedup group into one row; every remaining column becomes the
# list of that group's member values.
dup = df_09.groupby(by='dedup_id').agg(list)
dup

Unnamed: 0_level_0,duplicate_id,original_id,name,source
dedup_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dedup::000871c1fc726f0b52dc86a4eeb027de,"[4612, 4649]","[4612, 4649]","[IIT Bombay Institutional Repository, IIT Bomb...","[roar, roar]"
dedup::001e6d882e54c780ce269d3c46997287,[re3data_____::4af9fe2bb93511a5e0f0c39e94d6557...,"[r3d100011306, https://fairsharing.org/10.2550...","[RESID Database of Protein Modifications, RESI...","[re3data, FAIRsharing]"
dedup::0023a1e3447fdb31836536cc903f1310,[opendoar____::c6f798b844366ccd65d99bc7f31e0e0...,"[3410, 10013]",[erucu: electronic repository of the ukrainian...,"[OpenDOAR, roar]"
dedup::003ab6b40af9b488decea7c582d150a2,[https://fairsharing.org/10.25504/FAIRsharing....,[https://fairsharing.org/10.25504/FAIRsharing....,"[Synapse, Synapse]","[FAIRsharing, re3data]"
dedup::0064f599ed0adb5870a5b3ffe438e485,"[16034, opendoar____::d1f157379ea7e51d4a8c07af...","[16034, 9647]","[Giresun University Institutional Repository, ...","[roar, OpenDOAR]"
...,...,...,...,...
dedup::ff49cc40a8890e6a60f40ff3026d2730,"[1333, opendoar____::2bd7f907b7f5b6bbd91822c0c...","[1333, 1389]","[UnissResearch, unissresearch]","[roar, OpenDOAR]"
dedup::ff4d70de478038c72282b7e4af1d4260,[opendoar____::95a0810a93a87065bf7b28490817e9e...,"[9752, 16367]","[european xfel publication database, European ...","[OpenDOAR, roar]"
dedup::ff826ce6ee85809389f18a5fafe72366,[opendoar____::62e7f2e090fe150ef8deb4466fdc81b...,"[3601, 2608]",[electronic odessa national economic universit...,"[OpenDOAR, OpenDOAR]"
dedup::ffbd6cbb019a1413183c8d08f2929307,"[3108, opendoar____::ff7d0f525b3be596a51fb9194...","[3108, 1912]","[Fotografía Sobre España en el Siglo XIX, foto...","[roar, OpenDOAR]"


In [9]:
# Groups whose source list has exactly one entry, i.e. a "duplicate" group
# that contains only a single record.
dup.loc[dup['source'].str.len().eq(1)]

Unnamed: 0_level_0,duplicate_id,original_id,name,source
dedup_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dedup::01846ae470651e97d2f73fce979406a9,[opendoar____::b4d6f2b565ca0eef1f9245403aac366a],[7668],[digital commons at michigan state university ...,[OpenDOAR]
dedup::022036087426786cfd0f7f41fa7a2665,[https://fairsharing.org/10.25504/FAIRsharing....,[https://fairsharing.org/10.25504/FAIRsharing....,[World Data Center for Climate at DRKZ],[FAIRsharing]
dedup::07e8b472e1e4af17a6b20ce083baf29f,[15036],[15036],[MiCISAN],[roar]
dedup::0894634a3244e3050d8057a453e17e57,[https://fairsharing.org/10.25504/FAIRsharing....,[https://fairsharing.org/10.25504/FAIRsharing....,[European Variation Archive],[FAIRsharing]
dedup::0a54b19a13b6712dc04d1b49215423d8,[opendoar____::d34ab169b70c9dcd35e62896010cd9ff],[377],[yale medicine thesis digital library],[OpenDOAR]
...,...,...,...,...
dedup::f8306c8f16096b6d944799f4d427a976,[re3data_____::574b553c6c374d597d2068ab2b117889],[r3d100012041],[Canadian Disaster Database],[re3data]
dedup::f9d8e2daaa9144310b66bf948e50d656,[re3data_____::95014789f83d7611ebfddace19d0523a],[r3d100011045],[Index to Marine & Lacustrine Geological Samples],[re3data]
dedup::fcdbc4f504a15df8f78da88ee72fad32,[opendoar____::9f96f36b7aae3b1ff847c26ac94c604e],[4979],[university of minnesota law school],[OpenDOAR]
dedup::fcfe9c770eb9372e6961a17f7eaffd5f,[4637],[4637],[Simon Fraser University Institutional Reposit...,[roar]


In [10]:
# Original ids of the groups that merged six or more records.
dup.loc[dup['source'].str.len().ge(6), 'original_id'].values

array([list(['r3d100013204', 'r3d100013458', 'r3d100012807', 'r3d100012808', 'r3d100012806', 'r3d100012805']),
       list(['243', '5702', '5715', '5694', '5689', '5658', '5710', '5750', '5721', '5704']),
       list(['2738', '4991', '2727', '2729', '2724', '2728', '2740', '174']),
       list(['19', '8', '7', '11', '10', '13', '6', '12', '20', '15', '9', '5', '14', '16'])],
      dtype=object)

In [11]:
# Spread the institution lists of six hard-coded re3data records into a frame,
# one row per record and one column per listed institution.
pd.DataFrame(
    re3data_df.loc[
        re3data_df['re3data_id'].isin([
            'r3d100013204',
            'r3d100013458',
            'r3d100012807',
            'r3d100012808',
            'r3d100012806',
            'r3d100012805',
        ]),
        'institution',
    ].to_list()
)

Unnamed: 0,0,1,2,3,4
0,"[Harvard University, Institute for Quantitativ...","[Ontario Council of University Libraries, [CBU...","[Scholars Portal Dataverse, [], CAN, [general]...","[The Dataverse Project, [], AAA, [technical], ...","[University of Ottawa, Library, [Université d'..."
1,"[Dataverse Project, [], AAA, [technical], non-...","[Harvard University, Institute for Quantitave ...","[Ontario Council of University Libraries, [CBU...","[University of Toronto, Libraries, [], CAN, [g...",
2,"[Harvard University, Institute for Quantitativ...","[Ontario Council of University Libraries, [CBU...","[Scholars Portal Dataverse, [dataverse@scholar...","[The Dataverse Project, [], AAA, [technical], ...","[University of Windsor, [], CAN, [general], no..."
3,"[Harvard University, Institute for Quantitativ...","[Ontario Council of University Libraries, [CBU...","[Scholars Portal Dataverse, [], CAN, [general]...","[The Dataverse Project, [], AAA, [technical], ...","[University of Waterloo, [], CAN, [general], n..."
4,"[University of Victoria, [UVic], CAN, [general...","[University of Victoria, Libraries, [], CAN, [...",,,
5,"[The University of British Columbia, [], CAN, ...","[University of British Columbia, Library, [], ...",,,


In [12]:
# Pull the ROAR rows for two eprint ids; ids are string literals because the
# ROAR frame was loaded with all columns as text.
roar_df.loc[roar_df['eprintid'].isin(['4612', '4649'])]

Unnamed: 0,eprintid,rev_number,eprint_status,userid,importid,source,dir,datestamp,lastmod,status_changed,type,succeeds,commentary,metadata_visibility,latitude,longitude,relation_type,relation_uri,item_issues_id,item_issues_type,item_issues_description,item_issues_timestamp,item_issues_status,item_issues_reported_by,item_issues_resolved_by,item_issues_comment,item_issues_count,sword_depositor,sword_slug,exemplar,home_page,title,oai_pmh,sword_endpoint,rss_feed,twitter_feed,description,fulltext,open_access,mandate,organisation_title,organisation_home_page,location_country,location_city,location_latitude,location_longitude,software,geoname,version,subjects,date,note,suggestions,activity_low,activity_medium,activity_high,recordcount,recordhistory,fulltexts_total,fulltexts_docs,fulltexts_rtotal,fulltexts_rdocs,registry_name,registry_id,submit_to,submitted_to_name,submitted_to_done,webometrics_rank,webometrics_size,webometrics_visibility,webometrics_rich_files,webometrics_scholar,monthly_deposits,total_deposits,association
4167,4612,28.0,archive,1380.0,,,disk0/00/00/46/12,2012-01-08 03:17:02,2012-04-16 10:53:04,2012-01-08 03:17:02,institutional,,,show,,,,,,,,,,,,,,,,,http://dspace.library.iitb.ac.in/jspui/,IIT Bombay Institutional Repository,http://dspace.library.iitb.ac.in/oai/request,,http://dspace.library.iitb.ac.in/xmlui/feed/at...,,,True,True,True,IIT Bombay,http://www.iitb.ac.in,in,Mumbai,19.133,72.9166,dspace,geoname_2_IN,other,TA,2011-12-15 09:01:35,,,0.0,0.0,0.0,99.0,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,95,9...",,,,,celestial,4790.0,,,,,,,,,,,
4168,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TD,,,,,,,,,,,,,,,,,,,,,,,,,
4169,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TH,,,,,,,,,,,,,,,,,,,,,,,,,
4170,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TJ,,,,,,,,,,,,,,,,,,,,,,,,,
4171,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TK,,,,,,,,,,,,,,,,,,,,,,,,,
4172,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TN,,,,,,,,,,,,,,,,,,,,,,,,,
4173,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TP,,,,,,,,,,,,,,,,,,,,,,,,,
16436,4649,8.0,archive,1380.0,,,disk0/00/00/46/49,2012-02-05 13:57:01,2012-04-16 10:39:58,2012-02-05 13:57:01,institutional,,,show,,,,,,,,,,,,,,,,,http://dspace.library.iitb.ac.in/jspui/,IIT Bombay Institutional Repository,http://dspace.library.iitb.ac.in/oai,,http://dspace.library.iitb.ac.in/xmlui/feed/rs...,,,True,True,False,IIT Bombay,http://www.iitb.ac.in,in,Mumbai,19.133,72.9166,dspace,geoname_2_IN,other,T1,2012-01-05 12:09:37,,,,,,,,,,,,celestial,4789.0,,,,,,,,,,,
16437,4649,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TA,,,,,,,,,,,,,,,,,,,,,,,,,


In [13]:
# Look up the full re3data record for a single re3data id.
re3data_df.loc[re3data_df['re3data_id'].eq('r3d100011306')]

Unnamed: 0,openaire_id,re3data_id,repository_name,additional_name,repository_url,repository_id,description,type,size,update_date,start_date,end_date,subject,mission_statement,content_type,provider_type,keyword,institution,policy,database_access,database_license,data_access,data_license,data_upload,data_upload_license,software,versioning,api,pid_system,citation_guideline_url,aid_system,enhanced_publication,quality_management,certificate,metadata_standard,syndication,remarks,entry_date,last_update
1090,re3data_____::4af9fe2bb93511a5e0f0c39e94d6557f,r3d100011306,RESID Database of Protein Modifications,[],https://pir.georgetown.edu/resid/resid.shtml,"[FAIRsharing_doi:10.25504/FAIRsharing.qaszjp, ...",The RESID Database of Protein Modifications is...,[disciplinary],,,2014,,"[2 Life Sciences, 201 Basic Biological and Med...",False,"[Images, Structured text]",[dataProvider],"[genomes, life sciences, proteins, proteomes, ...","[[Georgetown University, Medical Center, [GUMC...",True,True,False,True,True,True,False,True,yes,True,True,True,True,yes,unknown,False,False,False,RESID is covered by Thomson Reuters Data Citat...,2014-12-05,2019-01-17
