In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

In [19]:
# Load the deduplication result table. The file is read with header=None, so
# the column names are supplied explicitly via `names`.
# dtype=str keeps every column as text: the id columns mix numeric-looking
# values (e.g. 10453) with alphanumeric ones (e.g. r3d100011189), and leaving
# them to type inference risks mixed int/str values that would not match the
# string ids used for lookups in the source registries.
df_09 = pd.read_csv(
    '../data/interim/ds_dedup09.csv',
    sep=';',
    quotechar='"',
    header=None,
    names=['dedup_id', 'duplicate_id', 'original_id', 'name', 'source'],
    dtype=str,
)
# Summary per column: count, number of unique values, most frequent value.
df_09.describe()

Unnamed: 0,dedup_id,duplicate_id,original_id,name,source
count,4513,4513,4513,4513,4513
unique,2145,4513,4261,3894,4
top,dedup::75e33da9b103b7b91dcd8da0abe1354b,https://fairsharing.org/bsg-d001520,3860,UPN JATIM REPOSITORY,roar
freq,5,1,2,4,1933


In [13]:
# Load the tab-separated re3data export. The listed columns are passed through
# ast.literal_eval while reading, so their cells become Python objects instead
# of raw text.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    delimiter='\t',
    converters=dict.fromkeys(
        [
            'subject',
            'keyword',
            'additional_name',
            'repository_id',
            'type',
            'content_type',
            'provider_type',
            'institution',
        ],
        ast.literal_eval,
    ),
)

In [14]:
# Load the tab-separated OpenDOAR export. The listed columns are passed through
# ast.literal_eval while reading, so their cells become Python objects instead
# of raw text.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    delimiter='\t',
    converters=dict.fromkeys(
        [
            'subject',
            'additional_name',
            'opendoar_id',
            'content_type',
            'institution',
        ],
        ast.literal_eval,
    ),
)

In [15]:
# Load the ROAR export with every column read as text, so identifiers such as
# `eprintid` can be compared against string values.
roar_df = pd.read_csv(
    '../data/raw/export_roar_CSV.csv',
    dtype='str',
)

In [16]:
# Load the pipe-separated FAIRsharing summary. header=0 together with `names`
# discards the file's own header row and uses the names given here instead.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    sep='|',
    header=0,
    names=[
        'full_name',
        'short_name',
        'fs_url',
        'url',
        'countries',
        'subjects',
    ],
)

In [17]:
# Preview the first five rows of the deduplication table.
df_09.head(n=5)

Unnamed: 0,dedup_id,duplicate_id,original_id,name,source
0,dedup::252773ebafcbbac75238b419d964068e,https://fairsharing.org/bsg-d001520,https://fairsharing.org/bsg-d001520,ACTRIS Data Centre,FAIRsharing
1,dedup::860320be12a1c050cd7731794e231bd3,opendoar____::2290a7385ed77cc5592dc2153229f082,1064,oxford university research archive,OpenDOAR
2,dedup::1aa7a8773e6a7fdacbcedf9999009a38,opendoar____::191f8f858acda435ae0daf994e2a72c2,8648,digital commons@georgia southern,OpenDOAR
3,dedup::4801441f041958afaca324c43c40787b,10453,10453,MCStor,roar
4,dedup::2841194266115ac1cc04d19630cde46b,re3data_____::3afbb2b45a3dd218a5a091ca773cf6c5,r3d100011189,PRISM: University of Calgary's Digital Repository,re3data


In [44]:
# Collapse the table to one row per dedup_id: every remaining column becomes
# the list of values in that group, and `source_set` holds the distinct
# sources the group draws from.
dup = (
    df_09
    .groupby('dedup_id')
    .agg(list)
    .assign(source_set=lambda grouped: grouped['source'].map(set))
)
dup

Unnamed: 0_level_0,duplicate_id,original_id,name,source,source_set
dedup_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dedup::000871c1fc726f0b52dc86a4eeb027de,"[4612, 4649]","[4612, 4649]","[IIT Bombay Institutional Repository, IIT Bomb...","[roar, roar]",{roar}
dedup::001e6d882e54c780ce269d3c46997287,[re3data_____::4af9fe2bb93511a5e0f0c39e94d6557...,"[r3d100011306, https://fairsharing.org/10.2550...","[RESID Database of Protein Modifications, RESI...","[re3data, FAIRsharing]","{re3data, FAIRsharing}"
dedup::0023a1e3447fdb31836536cc903f1310,[opendoar____::c6f798b844366ccd65d99bc7f31e0e0...,"[3410, 10013]",[erucu: electronic repository of the ukrainian...,"[OpenDOAR, roar]","{OpenDOAR, roar}"
dedup::003ab6b40af9b488decea7c582d150a2,[https://fairsharing.org/10.25504/FAIRsharing....,[https://fairsharing.org/10.25504/FAIRsharing....,"[Synapse, Synapse]","[FAIRsharing, re3data]","{re3data, FAIRsharing}"
dedup::0064f599ed0adb5870a5b3ffe438e485,"[16034, opendoar____::d1f157379ea7e51d4a8c07af...","[16034, 9647]","[Giresun University Institutional Repository, ...","[roar, OpenDOAR]","{OpenDOAR, roar}"
...,...,...,...,...,...
dedup::ff49cc40a8890e6a60f40ff3026d2730,"[1333, opendoar____::2bd7f907b7f5b6bbd91822c0c...","[1333, 1389]","[UnissResearch, unissresearch]","[roar, OpenDOAR]","{OpenDOAR, roar}"
dedup::ff4d70de478038c72282b7e4af1d4260,[opendoar____::95a0810a93a87065bf7b28490817e9e...,"[9752, 16367]","[european xfel publication database, European ...","[OpenDOAR, roar]","{OpenDOAR, roar}"
dedup::ff826ce6ee85809389f18a5fafe72366,[opendoar____::62e7f2e090fe150ef8deb4466fdc81b...,"[3601, 2608]",[electronic odessa national economic universit...,"[OpenDOAR, OpenDOAR]",{OpenDOAR}
dedup::ffbd6cbb019a1413183c8d08f2929307,"[3108, opendoar____::ff7d0f525b3be596a51fb9194...","[3108, 1912]","[Fotografía Sobre España en el Siglo XIX, foto...","[roar, OpenDOAR]","{OpenDOAR, roar}"


In [45]:
# Keep only the groups whose records come from at least three distinct sources.
spans_three_plus_sources = dup['source_set'].str.len().ge(3)
dup.loc[spans_three_plus_sources]

Unnamed: 0_level_0,duplicate_id,original_id,name,source,source_set
dedup_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dedup::06138bc5af6023646ede0e1f7c1eac75,[https://fairsharing.org/10.25504/FAIRsharing....,[https://fairsharing.org/10.25504/FAIRsharing....,"[Crystallography Open Database, Crystallograph...","[FAIRsharing, roar, re3data, OpenDOAR]","{re3data, OpenDOAR, roar, FAIRsharing}"
dedup::0b7e684c89e746c67c9761ce2b65479c,[re3data_____::44217da669f17a260c0958a679003a7...,"[r3d100010423, 375, https://fairsharing.org/10...","[Woods Hole Open Access Server, woods hole ope...","[re3data, OpenDOAR, FAIRsharing]","{re3data, FAIRsharing, OpenDOAR}"
dedup::0bb4aec1710521c12ee76289d9440817,[re3data_____::eb721a14697a05c477d0ae23830e665...,"[r3d100012596, 650, 349]","[Digitale Bibliothek Thüringen, digitale bibli...","[re3data, OpenDOAR, roar]","{re3data, roar, OpenDOAR}"
dedup::139042a4157a773f209847829d80894d,"[756, re3data_____::a95b34b344dc049963c35997fe...","[756, r3d100010690, 1330, 5487]","[Khazar University Institutional Repository, K...","[roar, re3data, OpenDOAR, roar]","{re3data, roar, OpenDOAR}"
dedup::13d4bfa0321f86f042b34ec79064b316,[opendoar____::e3844e186e6eb8736e9f53c0c588952...,"[9411, 15255, r3d100013135]","[fordatis, Fordatis, Fordatis]","[OpenDOAR, roar, re3data]","{OpenDOAR, re3data, roar}"
...,...,...,...,...,...
dedup::eded0708dfe855304a50029fccf1a677,[opendoar____::eccbc87e4b5ce2fe28308fd9f2a7baf...,"[3, r3d100012604, 5509]","[ams acta, AMS Acta, AMS Acta]","[OpenDOAR, re3data, roar]","{OpenDOAR, roar, re3data}"
dedup::ef46a43afd7c7d67e21f4306bb1364e9,[opendoar____::1f74a54f39b3123ad272ca0a06e7463...,"[5870, https://fairsharing.org/10.25504/FAIRsh...","[heidata, heiDATA, heiDATA]","[OpenDOAR, FAIRsharing, re3data]","{OpenDOAR, re3data, FAIRsharing}"
dedup::f296bb3903d8a84d81c47e6db90764b9,[https://fairsharing.org/10.25504/FAIRsharing....,[https://fairsharing.org/10.25504/FAIRsharing....,"[PubChem, PubChem, pubchem]","[FAIRsharing, re3data, OpenDOAR]","{re3data, OpenDOAR, FAIRsharing}"
dedup::f3dc6512e46961c363ea402ff218c8fb,[re3data_____::b2fc675049cbf485d9abbccf5232a31...,"[r3d100012538, 10171, https://fairsharing.org/...","[DataverseNO, dataverseno, DataverseNO]","[re3data, OpenDOAR, FAIRsharing]","{re3data, FAIRsharing, OpenDOAR}"


In [33]:
# Expand the `institution` lists of the selected re3data records into a frame:
# one row per record, one column per list position.
selected_re3data_ids = [
    'r3d100013204',
    'r3d100013458',
    'r3d100012807',
    'r3d100012808',
    'r3d100012806',
    'r3d100012805',
]
selected_re3data_rows = re3data_df.loc[re3data_df['re3data_id'].isin(selected_re3data_ids)]
pd.DataFrame(selected_re3data_rows['institution'].to_list())

Unnamed: 0,0,1,2,3,4
0,"[Harvard University, Institute for Quantitativ...","[Ontario Council of University Libraries, [CBU...","[Scholars Portal Dataverse, [], CAN, [general]...","[The Dataverse Project, [], AAA, [technical], ...","[University of Ottawa, Library, [Université d'..."
1,"[Dataverse Project, [], AAA, [technical], non-...","[Harvard University, Institute for Quantitave ...","[Ontario Council of University Libraries, [CBU...","[University of Toronto, Libraries, [], CAN, [g...",
2,"[Harvard University, Institute for Quantitativ...","[Ontario Council of University Libraries, [CBU...","[Scholars Portal Dataverse, [dataverse@scholar...","[The Dataverse Project, [], AAA, [technical], ...","[University of Windsor, [], CAN, [general], no..."
3,"[Harvard University, Institute for Quantitativ...","[Ontario Council of University Libraries, [CBU...","[Scholars Portal Dataverse, [], CAN, [general]...","[The Dataverse Project, [], AAA, [technical], ...","[University of Waterloo, [], CAN, [general], n..."
4,"[University of Victoria, [UVic], CAN, [general...","[University of Victoria, Libraries, [], CAN, [...",,,
5,"[The University of British Columbia, [], CAN, ...","[University of British Columbia, Library, [], ...",,,


In [12]:
# Inspect the raw ROAR rows for the two eprint ids under comparison.
eprint_ids_to_inspect = ['4612', '4649']
roar_df.loc[roar_df['eprintid'].isin(eprint_ids_to_inspect)]

Unnamed: 0,eprintid,rev_number,eprint_status,userid,importid,source,dir,datestamp,lastmod,status_changed,type,succeeds,commentary,metadata_visibility,latitude,longitude,relation_type,relation_uri,item_issues_id,item_issues_type,item_issues_description,item_issues_timestamp,item_issues_status,item_issues_reported_by,item_issues_resolved_by,item_issues_comment,item_issues_count,sword_depositor,sword_slug,exemplar,home_page,title,oai_pmh,sword_endpoint,rss_feed,twitter_feed,description,fulltext,open_access,mandate,organisation_title,organisation_home_page,location_country,location_city,location_latitude,location_longitude,software,geoname,version,subjects,date,note,suggestions,activity_low,activity_medium,activity_high,recordcount,recordhistory,fulltexts_total,fulltexts_docs,fulltexts_rtotal,fulltexts_rdocs,registry_name,registry_id,submit_to,submitted_to_name,submitted_to_done,webometrics_rank,webometrics_size,webometrics_visibility,webometrics_rich_files,webometrics_scholar,monthly_deposits,total_deposits,association
4167,4612,28.0,archive,1380.0,,,disk0/00/00/46/12,2012-01-08 03:17:02,2012-04-16 10:53:04,2012-01-08 03:17:02,institutional,,,show,,,,,,,,,,,,,,,,,http://dspace.library.iitb.ac.in/jspui/,IIT Bombay Institutional Repository,http://dspace.library.iitb.ac.in/oai/request,,http://dspace.library.iitb.ac.in/xmlui/feed/at...,,,True,True,True,IIT Bombay,http://www.iitb.ac.in,in,Mumbai,19.133,72.9166,dspace,geoname_2_IN,other,TA,2011-12-15 09:01:35,,,0.0,0.0,0.0,99.0,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,95,9...",,,,,celestial,4790.0,,,,,,,,,,,
4168,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TD,,,,,,,,,,,,,,,,,,,,,,,,,
4169,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TH,,,,,,,,,,,,,,,,,,,,,,,,,
4170,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TJ,,,,,,,,,,,,,,,,,,,,,,,,,
4171,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TK,,,,,,,,,,,,,,,,,,,,,,,,,
4172,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TN,,,,,,,,,,,,,,,,,,,,,,,,,
4173,4612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TP,,,,,,,,,,,,,,,,,,,,,,,,,
16436,4649,8.0,archive,1380.0,,,disk0/00/00/46/49,2012-02-05 13:57:01,2012-04-16 10:39:58,2012-02-05 13:57:01,institutional,,,show,,,,,,,,,,,,,,,,,http://dspace.library.iitb.ac.in/jspui/,IIT Bombay Institutional Repository,http://dspace.library.iitb.ac.in/oai,,http://dspace.library.iitb.ac.in/xmlui/feed/rs...,,,True,True,False,IIT Bombay,http://www.iitb.ac.in,in,Mumbai,19.133,72.9166,dspace,geoname_2_IN,other,T1,2012-01-05 12:09:37,,,,,,,,,,,,celestial,4789.0,,,,,,,,,,,
16437,4649,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TA,,,,,,,,,,,,,,,,,,,,,,,,,


In [13]:
# Inspect the full re3data record with this identifier.
re3data_df.loc[re3data_df['re3data_id'].eq('r3d100011306')]

Unnamed: 0,openaire_id,re3data_id,repository_name,additional_name,repository_url,repository_id,description,type,size,update_date,start_date,end_date,subject,mission_statement,content_type,provider_type,keyword,institution,policy,database_access,database_license,data_access,data_license,data_upload,data_upload_license,software,versioning,api,pid_system,citation_guideline_url,aid_system,enhanced_publication,quality_management,certificate,metadata_standard,syndication,remarks,entry_date,last_update
1090,re3data_____::4af9fe2bb93511a5e0f0c39e94d6557f,r3d100011306,RESID Database of Protein Modifications,[],https://pir.georgetown.edu/resid/resid.shtml,"[FAIRsharing_doi:10.25504/FAIRsharing.qaszjp, ...",The RESID Database of Protein Modifications is...,[disciplinary],,,2014,,"[2 Life Sciences, 201 Basic Biological and Med...",False,"[Images, Structured text]",[dataProvider],"[genomes, life sciences, proteins, proteomes, ...","[[Georgetown University, Medical Center, [GUMC...",True,True,False,True,True,True,False,True,yes,True,True,True,True,yes,unknown,False,False,False,RESID is covered by Thomson Reuters Data Citat...,2014-12-05,2019-01-17
