registries_analysis/notebooks/03-overlap.ipynb

696 KiB
Raw Blame History

In [1]:
import ast
import csv
import json
import glom

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

pd.set_option('display.max_columns', None)

Loading data from registries

In [2]:
with open('../data/raw/fairsharing_dump_api_09_2021.json') as f:
    lines = f.read().splitlines()
    
fairsharing_df = pd.DataFrame(lines)
fairsharing_df.columns = ['json_element']
fairsharing_df['json_element'].apply(json.loads)
fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))

fairsharing_df['unique_id'] = 'FAIRsharing_' + fairsharing_df.id
fairsharing_df = fairsharing_df.add_prefix('FAIRsharing_')
fairsharing_df.head()
Out[2]:
FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id
0 1723 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te Cell Image Library ready [{'contact-name': 'David Orloff', 'contact-ema... http://www.cellimagelibrary.org 1723 This library is a public and easily accessible... [{'url': 'http://www.cellimagelibrary.org/page... 2010.0 [{'name': 'live update', 'type': 'data release... [biodbcore-000180, bsg-d000180] Database repository [Cell Biology, Life Science] [Cell, Microscopy, Light microscopy, Electron ... [All] [] [United States] FAIRsharing record for: Cell Image Library None https://fairsharing.org/10.25504/FAIRsharing.8... 10.25504/FAIRsharing.8t18te https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: This librar... [{'id': 232, 'pubmed_id': 23203874, 'title': '... [{'licence-name': 'Cell Image Library Data Pol... NaN NaN NaN NaN NaN NaN NaN FAIRsharing_1723
1 3101 fairsharing-records 2020-09-16T08:49:13.000Z 2021-09-30T11:36:45.452Z NaN WHOI Ship Data-Grabber System ready NaN http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html 3101 The WHOI Ship DataGrabber system provides the ... [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o... 2004.0 [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai... [biodbcore-001609, bsg-d001609] Database repository [Earth Science, Water Research, Oceanography] [] [Not applicable] [subseafloor environments] [United States] FAIRsharing record for: WHOI Ship Data-Grabber... None https://fairsharing.org/fairsharing_records/3101 None https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The WHOI Sh... [] [{'licence-name': 'NDSF Data Archive Policy', ... NaN NaN NaN NaN NaN NaN NaN FAIRsharing_3101
2 2649 fairsharing-records 2018-08-07T20:23:32.000Z 2021-09-30T11:39:07.898Z NaN Electron Microscope Public Image Archive ready [{'contact-name': 'General contact', 'contact-... https://www.ebi.ac.uk/pdbe/emdb/empiar/ 2649 EMPIAR, the Electron Microscopy Public Image A... [{'url': 'https://www.ebi.ac.uk/support/EMPIAR... 2015.0 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... [biodbcore-001140, bsg-d001140] Database repository [Bioinformatics, Biology] [Protein image, Microscopy, Electron microscop... [All] [] [Greece, Czech Republic, United Kingdom, Icela... FAIRsharing record for: Electron Microscope Pu... EMPIAR https://fairsharing.org/fairsharing_records/2649 None https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: EMPIAR, the... [{'id': 2232, 'pubmed_id': 27067018, 'title': ... [{'licence-name': 'EMBL-EBI Terms of Use', 'li... [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27... EMPIAR [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... NaN NaN NaN FAIRsharing_2649
3 2657 fairsharing-records 2018-08-13T15:12:11.000Z 2021-09-30T11:37:28.736Z 10.25504/FAIRsharing.tnByoG ClinicalStudyDataRequest.com ready [{'contact-email': 'support@clinicalstudydatar... https://clinicalstudydatarequest.com/ 2657 ClinicalStudyDataRequest.com (CSDR) is a conso... [{'url': 'https://clinicalstudydatarequest.com... 2014.0 [{'url': 'https://clinicalstudydatarequest.com... [biodbcore-001149, bsg-d001149] Database repository [Preclinical Studies, Biomedical Science] [] [Homo sapiens] [] [Worldwide] FAIRsharing record for: ClinicalStudyDataReque... CSDR https://fairsharing.org/10.25504/FAIRsharing.t... 10.25504/FAIRsharing.tnByoG https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: ClinicalStu... [] [{'licence-name': 'CSDR Data Sharing Agreement... NaN CSDR NaN NaN NaN NaN NaN FAIRsharing_2657
4 2078 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:34:43.129Z 10.25504/FAIRsharing.3axym7 Germplasm Resources Information Network ready [{'contact-email': 'dbmu@ars-grin.gov'}] https://www.ars-grin.gov/ 2078 GRIN provides National Genetic Resources Progr... [{'url': 'https://www.ars-grin.gov/Pages/Colle... 2010.0 [{'url': 'https://www.ars-grin.gov/', 'name': ... [biodbcore-000546, bsg-d000546] Database repository [Life Science] [Cell, Cell culture, Germplasm] [Bacteria, Metazoa, Viridiplantae] [] [United States] FAIRsharing record for: Germplasm Resources In... GRIN https://fairsharing.org/10.25504/FAIRsharing.3... 10.25504/FAIRsharing.3axym7 https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: GRIN provid... [] [] NaN GRIN NaN NaN NaN NaN NaN FAIRsharing_2078
In [3]:
re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t',
                        converters={'subject': ast.literal_eval,
                                    'keyword': ast.literal_eval,
                                    'additionalName': ast.literal_eval,
                                    'repositoryIdentifier': ast.literal_eval,
                                    'type': ast.literal_eval,
                                    'contentType': ast.literal_eval,
                                    'providerType': ast.literal_eval,
                                    'institution': ast.literal_eval
                                    })

re3data_df['unique_id'] = 're3data_' + re3data_df.orgIdentifier
re3data_df = re3data_df.add_prefix('re3data_')
re3data_df.head()
Out[3]:
re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id
0 r3d100000001 Odum Institute Archive Dataverse eng [] https://dataverse.unc.edu/dataverse/odum [] ["https://dataverse.unc.edu/dataverse/odum#", ... The Odum Institute Archive Dataverse contains ... eng [disciplinary] {"size": "13 dataverses; 3.050 datasets", "upd... NaN NaN ["eng"] [1 Humanities and Social Sciences, 111 Social ... NaN [Databases, Plain text, Scientific and statist... [dataProvider] [FAIR, Middle East, crime, demography, economy... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC0", "databaseLicen... [{"dataAccessType": "embargoed", "dataAccessRe... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["DataVerse"] NaN [] ["DOI"] NaN [] unknown yes ["other"] [{"metadataStandardName": "DDI - Data Document... {} Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06 re3data_r3d100000001
1 r3d100000002 Access to Archival Databases eng [{'additionalName': 'AAD', 'additionalNameLang... https://aad.archives.gov/aad/ [RRID:SCR_010479, RRID:nlx_157752] ["https://www.archives.gov/contact"] You will find in the Access to Archival Databa... eng [disciplinary] {"size": "", "updatedp": ""} 1985 NaN ["eng", "spa"] [1 Humanities and Social Sciences, 102 History... https://www.archives.gov/publications/general-... [Images, Standard office documents, Structured... [dataProvider] [US History] [{'institutionName': 'The U.S. National Archiv... [{"policyName": "Contribution Policy", "policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "Copyrights", "dataLicens... restricted [] ["unknown"] no ["https://www.archives.gov/developer#toc-appli... ["none"] https://aad.archives.gov/aad/help/getting-star... [] unknown unknown [] [] {"syndication": "http://www.archives.gov/socia... NaN 2012-07-04 2021-05-25 re3data_r3d100000002
2 r3d100000004 Datenbank Gesprochenes Deutsch deu [{'additionalName': 'DGD', 'additionalNameLang... https://dgd.ids-mannheim.de/ [] ["dgd@ids-mannheim.de"] The "Database for Spoken German (DGD)" is a co... eng [disciplinary] {"size": "34 corpora", "updatedp": "2020-02-03"} 2012 NaN ["deu"] [1 Humanities and Social Sciences, 104 Linguis... https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... [Audiovisual data, Standard office documents, ... [dataProvider, serviceProvider] [Australian German, FOLK, German dialects, Pfe... [{'institutionName': 'Institut für Deutsche Sp... [{"policyName": "Erfurter Aufruf zur Sicherung... {"databaseAccessType": "restricted", "databas... [] [{"dataAccessType": "restricted", "dataAccessR... [{"dataLicenseName": "other", "dataLicenseURL"... restricted [] ["other"] yes [] ["none"] http://agd.ids-mannheim.de/konditionen.shtml [] unknown unknown ["RatSWD"] [] {} NaN 2012-07-20 2020-08-27 re3data_r3d100000004
3 r3d100000005 UNC Dataverse eng [{'additionalName': 'University of North Carol... https://dataverse.unc.edu/ [] ["https://dataverse.unc.edu/", "odumarchive@un... UNC Dataverse is an open-source repository sof... eng [institutional] {"size": "186 dataverses; 25.272 studies; 229.... 2011 NaN ["eng"] [1 Humanities and Social Sciences, 111 Social ... https://odum.unc.edu/about/mission-vision/ [Archived data, Plain text, Raw data, Scientif... [dataProvider, serviceProvider] [FAIR, census, demographic survey, demography,... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Data Deposit Form"... ["DataVerse"] yes ["https://guides.dataverse.org/en/latest/api/n... ["ARK", "DOI", "PURL", "URN", "hdl"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [{"metadataStandardName": "DDI - Data Document... {} UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-08-11 re3data_r3d100000005
4 r3d100000006 Archaeology Data Service eng [{'additionalName': 'ADS', 'additionalNameLang... https://archaeologydataservice.ac.uk/ [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] ["help@archaeologydataservice.ac.uk", "https:/... The ADS is an accredited digital repository fo... eng [disciplinary] {"size": "1837 results", "updatedp": "2020-05-... 1996-10-01 NaN ["eng"] [1 Humanities and Social Sciences, 101 Ancient... https://archaeologydataservice.ac.uk/about/our... [Archived data, Audiovisual data, Databases, I... [dataProvider, serviceProvider] [FAIR, archaeology, cultural heritage, prehist... [{'institutionName': 'Arts and Humanities Rese... [{"policyName": "ADS Guides to good practice",... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC", "databaseLicens... [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Guidelines for Dep... ["other"] yes ["https://archaeologydataservice.ac.uk/about/e... ["DOI"] https://archaeologydataservice.ac.uk/advice/te... [] unknown yes ["other"] [{"metadataStandardName": "DataCite Metadata S... {"syndication": "https://archaeologydataservic... ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02 re3data_r3d100000006
In [4]:
opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
                         converters={'repository_metadata.content_subjects_phrases': ast.literal_eval,
                                    'repository_metadata.alternativename': ast.literal_eval,
                                    'repository_metadata.content_types': ast.literal_eval,
                                    'organization': ast.literal_eval
                                    },
                         dtype={'system_metadata.id': str})

opendoar_df['unique_id'] = 'OpenDOAR_' + opendoar_df['system_metadata.id']
opendoar_df = opendoar_df.add_prefix('OpenDOAR_')
opendoar_df.head()
Out[4]:
OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects_phrases OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_unique_id
0 175 {"name": "hku theses online", "language": "en"} [] http://hub.hku.hk/handle/10722/1057 this is an institutional repository providing ... institutional ["zh", "en"] 2021-03-25 10:16:18 2005-12-21 12:44:08 [multidisciplinary] [bibliographic_references, theses_and_disserta... [{'name': 'university of hong kong', 'alternat... [] {"name": "dspace", "version": "cris-5.3.1-snap... NaN yes OpenDOAR_175
1 64 {"name": "research support scheme - central eu... [] http://rss.archives.ceu.hu/ this is an institutional repository collecting... institutional ["cs", "en", "hu", "ru"] 2021-03-25 09:48:31 2006-01-04 14:59:30 [multidisciplinary] [unpub_reports_and_working_papers] [{'name': 'central european university', 'alte... [] {"name": "eprints", "version": "2.2.1"} http://rss.archives.ceu.hu/perl/oai2 yes OpenDOAR_64
2 151 {"name": "cadmus, eui research repository", "l... [] http://cadmus.eui.eu/ cadmus is the name of the eui research reposit... institutional ["nl", "en", "fr", "de", "it"] 2021-09-13 13:35:36 2006-01-04 12:07:07 [history and archaeology, multidisciplinary, s... [journal_articles, theses_and_dissertations, u... [{'name': 'european university institute', 'al... [{"policy_url": "https://www.eui.eu/research/e... {"name": "dspace", "version": "5.2"} http://cadmus.eui.eu/oai/request yes OpenDOAR_151
3 105 {"name": "document server@uhasselt", "language... [] https://doclib.uhasselt.be/dspace/ this site is a university repository providing... institutional ["nl", "en", "fr", "de"] 2021-04-16 15:23:52 2006-01-24 15:46:44 [multidisciplinary] [journal_articles, conference_and_workshop_pap... [{'name': 'uhasselt', 'alternativeName': 'hass... [] {"name": "dspace", "version": "1.7.2"} http://doclib.uhasselt.be/dspace-oai/request yes OpenDOAR_105
4 101 {"name": "utrecht university repository", "lan... [] http://dspace.library.uu.nl this site is a university repository providing... institutional ["nl", "en"] 2021-04-16 15:22:03 2006-01-13 12:55:13 [multidisciplinary] [journal_articles, conference_and_workshop_pap... [{'name': 'university of utrecht', 'alternativ... [] {"name": "dspace", "version": ""} https://dspace.library.uu.nl/oai/request yes OpenDOAR_101
In [5]:
roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
roar_df = roar_df.groupby('eprintid').aggregate(set)

def value_or_list(cell_set):
    copy = set(cell_set)
    copy.discard(np.nan) 
    if len(copy) == 0:
        return np.nan
    if len(copy) == 1:
        return copy.pop()
    return list(copy)
        
roar_df = roar_df.applymap(value_or_list)
roar_df.reset_index(inplace=True)

roar_df['unique_id'] = 'roar_' + roar_df.eprintid
roar_df = roar_df.add_prefix('roar_')
roar_df.head()
Out[5]:
roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
0 1 633 archive 1 NaN NaN disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-18 05:40:07 2010-01-06 13:43:48 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://archivesic.ccsd.cnrs.fr/ @RCHIVESIC http://archivesic.ccsd.cnrs.fr/oai/oai.php NaN NaN NaN NaN NaN NaN NaN NaN NaN fr NaN NaN NaN hal geoname_2_FR other NaN 2002-05-17 19:24:41 NaN NaN 0 0 0 25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... NaN NaN NaN NaN [opendoar, celestial] [669, 58] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_1
1 10 511 archive 1 NaN NaN disk0/00/00/00/10 2010-01-06 13:43:48 2011-07-18 05:40:13 2010-01-06 13:43:48 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://www.diva-portal.org/mdh/ Academic Archive On-line (Mälardalen Universit... http://www.diva-portal.org/oai/mdh/OAI NaN NaN NaN NaN TRUE TRUE NaN NaN NaN se Uppsala 59.8667 17.6333 diva geoname_2_SE other NaN 2005-12-08 13:15:22 NaN NaN 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,100... NaN NaN NaN NaN [opendoar, celestial] [526, 258] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_10
2 1000 274 archive 1 NaN NaN disk0/00/00/10/00 2010-01-06 13:45:01 2011-07-06 08:21:21 2010-01-06 13:45:01 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://pam.pisharp.org/ PAM - Portuguese Archive of Mathematics NaN NaN NaN NaN NaN TRUE TRUE NaN NaN NaN pt Bellevue, WA 47.6034 -122.155 dspace geoname_2_PT other NaN 2006-05-04 10:48:14 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_1000
3 10001 20 archive 91 NaN NaN disk0/00/01/00/01 2015-08-08 14:52:11 2016-03-21 19:44:01 2015-08-08 14:52:11 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://edoc.sub.uni-hamburg.de/klimawandel/ Klimawandel Dokumentenserver http://edoc.sub.uni-hamburg.de/klimawandel/oai NaN NaN NaN The "Documentenserver Klimawandel" (Repository... TRUE TRUE TRUE [Climate Service Center 2.0, Helmholtz-Zentrum... [http://www.hzg.de/, http://www.klimzug.de/de/... de Hamburg 53.5511 9.9937 opus geoname_2_DE other [GE, GF, G1, S1, HD] 2015-07-02 08:08:31 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [opendoar, celestial] [3408, 5881] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_10001
4 10008 11 archive 404 NaN NaN disk0/00/01/00/08 2015-08-08 14:52:26 2016-03-21 19:43:51 2015-08-08 14:52:26 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://creativematter.skidmore.edu/ Creative Matter | Skidmore College Research http://creativematter.skidmore.edu/do/oai/ NaN http://creativematter.skidmore.edu/recent.rss NaN Welcome to Creative Matter, a repository for t... TRUE FALSE FALSE Skidmore College http://www.skidmore.edu/ us Saratoga Springs 43.0961 -73.7818 bepress geoname_2_US other NaN 2015-07-06 17:35:50 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN celestial 5882 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_10008
In [6]:
roar_df[roar_df.roar_eprintid == '10013']
Out[6]:
roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
7 10013 31 archive 7104 NaN NaN disk0/00/01/00/13 2015-08-08 14:53:04 2016-03-21 19:54:43 2015-08-08 14:53:04 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://er.ucu.edu.ua/ ErUCU: Electronic repository of the Ukrainian ... http://er.ucu.edu.ua/oai/request http://er.ucu.edu.ua/sword/ http://er.ucu.edu.ua/feed/rss_2.0/site NaN Ukrainian Catholic Universitys institutional ... TRUE TRUE TRUE Ukrainian Catholic University http://ucu.edu.ua/eng/ ua Lviv NaN NaN dspace geoname_2_UA other [H1, L1, AC, D204, B1, D1, DK, BF, BS, HM, BL,... 2015-07-07 12:38:37 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [opendoar, celestial] [3410, 5883] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [russell_group, ivy_league] roar_10013

Loading dedup results

In [7]:
dup = pd.read_csv('../data/interim/fairsharing_dedup.csv', sep=';', quotechar='"', header=None, names=['dedup_id', 'duplicate_id', 'original_id', 'name', 'source'])
dup['unique_id'] = dup.source + '_' + dup.original_id
dup.head()
Out[7]:
dedup_id duplicate_id original_id name source unique_id
0 dedup::860320be12a1c050cd7731794e231bd3 opendoar____::2290a7385ed77cc5592dc2153229f082 1064 oxford university research archive OpenDOAR OpenDOAR_1064
1 dedup::1aa7a8773e6a7fdacbcedf9999009a38 opendoar____::191f8f858acda435ae0daf994e2a72c2 8648 digital commons@georgia southern OpenDOAR OpenDOAR_8648
2 dedup::31bceb0c3e2a260593e1e36655ebcee4 opendoar____::d5776aeecb3c45ab15adce6f5cb355f3 9713 materials data repository OpenDOAR OpenDOAR_9713
3 dedup::e37b08dd3015330dcbb5d6663667b8b8 opendoar____::18997733ec258a9fcaf239cc55d53363 427 digital repository at the university of maryland OpenDOAR OpenDOAR_427
4 dedup::2841194266115ac1cc04d19630cde46b re3data_____::3afbb2b45a3dd218a5a091ca773cf6c5 r3d100011189 PRISM: University of Calgary's Digital Repository re3data re3data_r3d100011189
In [8]:
dup.describe()
Out[8]:
dedup_id duplicate_id original_id name source unique_id
count 4617 4617 4617 4617 4617 4617
unique 2191 4617 4159 3968 4 4617
top dedup::75e33da9b103b7b91dcd8da0abe1354b opendoar____::2290a7385ed77cc5592dc2153229f082 2399 UPN JATIM REPOSITORY roar OpenDOAR_1064
freq 5 1 3 4 1977 1

Assessing duplicates across registries

In [9]:
dup_grouped = dup.groupby('dedup_id').aggregate(list)
dup_grouped['source_set'] = dup_grouped.source.map(set)
In [10]:
dup_grouped[dup_grouped.source_set.str.len() == 4].count()
Out[10]:
duplicate_id    6
original_id     6
name            6
source          6
unique_id       6
source_set      6
dtype: int64
In [11]:
dup_grouped[dup_grouped.source_set.str.len() == 3].count()
Out[11]:
duplicate_id    60
original_id     60
name            60
source          60
unique_id       60
source_set      60
dtype: int64
In [12]:
dup_grouped[dup_grouped.source_set.str.len() == 2].count()
Out[12]:
duplicate_id    1986
original_id     1986
name            1986
source          1986
unique_id       1986
source_set      1986
dtype: int64
In [13]:
dup_grouped[dup_grouped.source_set.str.len() == 1].count()
Out[13]:
duplicate_id    139
original_id     139
name            139
source          139
unique_id       139
source_set      139
dtype: int64

Assessing duplicates within registries

In [65]:
opendoar_dup = dup[dup.source == 'OpenDOAR'].groupby('dedup_id').count()
opendoar_dup[opendoar_dup.duplicate_id > 1].aggregate(['count', 'sum'])
Out[65]:
duplicate_id original_id name source unique_id
count 28 28 28 28 28
sum 58 58 58 58 58
In [64]:
re3data_dup = dup[dup.source == 're3data'].groupby('dedup_id').count()
re3data_dup[re3data_dup.duplicate_id > 1].aggregate(['count', 'sum'])
Out[64]:
duplicate_id original_id name source unique_id
count 3 3 3 3 3
sum 6 6 6 6 6
In [63]:
roar_dup = dup[dup.source == 'roar'].groupby('dedup_id').count()
roar_dup[roar_dup.duplicate_id > 1].aggregate(['count', 'sum'])
Out[63]:
duplicate_id original_id name source unique_id
count 249 249 249 249 249
sum 518 518 518 518 518
In [53]:
fairsharing_dup = dup[dup.source == 'FAIRsharing'].groupby('dedup_id').count()
fairsharing_dup[fairsharing_dup.duplicate_id > 1].count()
Out[53]:
duplicate_id    0
original_id     0
name            0
source          0
unique_id       0
dtype: int64

Isolating duplicates within a registry

In [14]:
dup_within = dup.groupby('dedup_id').aggregate(list)
dup_within['source_set'] = dup_within.source.map(set)
dup_within = dup_within[dup_within.source_set.str.len() == 1]
dup_within.head()
Out[14]:
duplicate_id original_id name source unique_id source_set
dedup_id
dedup::000871c1fc726f0b52dc86a4eeb027de [4612, 4649] [4612, 4649] [IIT Bombay Institutional Repository, IIT Bomb... [roar, roar] [roar_4612, roar_4649] {roar}
dedup::0163cceb20f5ca7b313419c068abd9dc [7943, 8003] [7943, 8003] [EPrints@NIRT Library Welcomes! - EPrints@NITR... [roar, roar] [roar_7943, roar_8003] {roar}
dedup::028ee724157b05d04e7bdcf237d12e60 [2670, 2698, 2741] [2670, 2698, 2741] [HSF Brage Open Research Archive, HSF Brage Op... [roar, roar, roar] [roar_2670, roar_2698, roar_2741] {roar}
dedup::03593ce517feac573fdaafa6dcedef61 [4393, 4394] [4393, 4394] [Institutional Repository of Kunming Institute... [roar, roar] [roar_4393, roar_4394] {roar}
dedup::03e0704b5690a2dee1861dc3ad3316c9 [1019, 5550] [1019, 5550] [PolyU Institutional Repository, PolyU Institu... [roar, roar] [roar_1019, roar_5550] {roar}
In [15]:
dup_within['source_set'] = dup_within.source_set.map(set.pop)
dup_within.head()
Out[15]:
duplicate_id original_id name source unique_id source_set
dedup_id
dedup::000871c1fc726f0b52dc86a4eeb027de [4612, 4649] [4612, 4649] [IIT Bombay Institutional Repository, IIT Bomb... [roar, roar] [roar_4612, roar_4649] roar
dedup::0163cceb20f5ca7b313419c068abd9dc [7943, 8003] [7943, 8003] [EPrints@NIRT Library Welcomes! - EPrints@NITR... [roar, roar] [roar_7943, roar_8003] roar
dedup::028ee724157b05d04e7bdcf237d12e60 [2670, 2698, 2741] [2670, 2698, 2741] [HSF Brage Open Research Archive, HSF Brage Op... [roar, roar, roar] [roar_2670, roar_2698, roar_2741] roar
dedup::03593ce517feac573fdaafa6dcedef61 [4393, 4394] [4393, 4394] [Institutional Repository of Kunming Institute... [roar, roar] [roar_4393, roar_4394] roar
dedup::03e0704b5690a2dee1861dc3ad3316c9 [1019, 5550] [1019, 5550] [PolyU Institutional Repository, PolyU Institu... [roar, roar] [roar_1019, roar_5550] roar
In [16]:
dup_within.groupby('source_set').count()
Out[16]:
duplicate_id original_id name source unique_id
source_set
OpenDOAR 16 16 16 16 16
re3data 2 2 2 2 2
roar 121 121 121 121 121
In [17]:
dup_within = dup[dup.dedup_id.isin(dup_within.index)]
dup_within
Out[17]:
dedup_id duplicate_id original_id name source unique_id
28 dedup::d2ddea18f00665ce8623e36bd4e3c7c5 8237 8237 AIR | Archivio Istituzionale della Ricerca roar roar_8237
31 dedup::4c5bcfec8584af0d967f1ab10179ca4b 2820 2820 USU Repository: Open Access Repository roar roar_2820
46 dedup::c2ae5cb2426d96ed19a50b0b7d7c8e11 9487 9487 IR at NRF: Home roar roar_9487
53 dedup::1c65cef3dfd1e00c0b03923a1c591db4 1241 1241 Swansea Metropolitan University Repository roar roar_1241
59 dedup::4217ec5d78c4bc4e5bd006783482441f 15142 15142 Repositorio Institucional roar roar_15142
... ... ... ... ... ... ...
4560 dedup::fc394e9935fbd62c8aedc372464e1965 7161 7161 Welcome to IR@NPL roar roar_7161
4586 dedup::000871c1fc726f0b52dc86a4eeb027de 4649 4649 IIT Bombay Institutional Repository roar roar_4649
4587 dedup::72c288a828485e5b1d4c52910d106734 16867 16867 Chung Shan Medical University Institutional Re... roar roar_16867
4598 dedup::0163cceb20f5ca7b313419c068abd9dc 8003 8003 EPrints@NIRT Library Welcomes! - EPrints@NIRT roar roar_8003
4608 dedup::2aeb1a8f8475cef63900be5d0780e872 15471 15471 Repository STIE Nobel Indonesia roar roar_15471

287 rows × 6 columns

Isolating duplicates across registries (hybrid)

In [18]:
dup_across = dup[~dup.dedup_id.isin(dup_within.dedup_id)]
dup_across = dup_across.groupby('dedup_id').aggregate(list)
dup_across['source_set'] = dup_across.source.map(set)

dup_hybrid = dup_across[dup_across.source_set.str.len() < dup_across.source.str.len()]
dup_hybrid = dup[dup.dedup_id.isin(dup_hybrid.index)]
dup_hybrid
Out[18]:
dedup_id duplicate_id original_id name source unique_id
12 dedup::471c50ad1a156d7256eddfd747d77931 opendoar____::6351bf9dce654515bf1ddbd6426dfa97 1996 ehtc repositorio institucional OpenDOAR OpenDOAR_1996
21 dedup::69dafe8b58066478aea48f3d0f384820 2312 2312 Göteborgs universitets publikationer - e-publi... roar roar_2312
26 dedup::8f822ac814829da24a7065b8131bdf47 opendoar____::a34bacf839b923770b2c360eefa26748 1035 kitami institute of technology repository OpenDOAR OpenDOAR_1035
41 dedup::63a99723ebb3af94d52b474c3b21dbe1 5779 5779 Sanok Digital Library roar roar_5779
47 dedup::82680bfec0fa08346c1b10d30a3e3d4a 11212 11212 Publication Server of the Wuppertal Institute roar roar_11212
... ... ... ... ... ... ...
4601 dedup::7810ccd41bf26faaa2c4e1f20db70a71 3172 3172 Tesis Electrónicas UACh roar roar_3172
4602 dedup::e655c7716a4b3ea67f48c6322fc42ed6 opendoar____::52c5189391854c93e8a0e1326e56c14f 1637 vtext digital repository OpenDOAR OpenDOAR_1637
4603 dedup::5ebe5626b9f1cd89fbb9f665a527591f 16225 16225 Necmettin Erbakan University Institutional Rep... roar roar_16225
4605 dedup::ec0bfd000f253eff3acb1043e1c06979 opendoar____::aa2a77371374094fe9e0bc1de3f94ed9 1829 npue ir OpenDOAR OpenDOAR_1829
4610 dedup::1c7836dbabd12c458d20e3b35633733a 14616 14616 SOAR@USA: Scholarship and Open Access Repository roar roar_14616

440 rows × 6 columns

Isolating duplicates across registries (pure)

In [19]:
dup_across = dup_across[dup_across.source_set.str.len() == dup_across.source.str.len()]
dup_across = dup[dup.dedup_id.isin(dup_across.index)]
dup_across
# dup[dup.dedup_id.isin(dup_across.index)]
Out[19]:
dedup_id duplicate_id original_id name source unique_id
0 dedup::860320be12a1c050cd7731794e231bd3 opendoar____::2290a7385ed77cc5592dc2153229f082 1064 oxford university research archive OpenDOAR OpenDOAR_1064
1 dedup::1aa7a8773e6a7fdacbcedf9999009a38 opendoar____::191f8f858acda435ae0daf994e2a72c2 8648 digital commons@georgia southern OpenDOAR OpenDOAR_8648
2 dedup::31bceb0c3e2a260593e1e36655ebcee4 opendoar____::d5776aeecb3c45ab15adce6f5cb355f3 9713 materials data repository OpenDOAR OpenDOAR_9713
3 dedup::e37b08dd3015330dcbb5d6663667b8b8 opendoar____::18997733ec258a9fcaf239cc55d53363 427 digital repository at the university of maryland OpenDOAR OpenDOAR_427
4 dedup::2841194266115ac1cc04d19630cde46b re3data_____::3afbb2b45a3dd218a5a091ca773cf6c5 r3d100011189 PRISM: University of Calgary's Digital Repository re3data re3data_r3d100011189
... ... ... ... ... ... ...
4612 dedup::5ef0b4eba35ab2d6180b0bca7e46b6f9 475 475 Ecological Restoration Institute - Northern Ar... roar roar_475
4613 dedup::66e8d052ec2230c66bd11ee6b5a0e3c8 14199 14199 Repositori STKIP PGRI Sumenep roar roar_14199
4614 dedup::1216a1bca4361c39d1d77965c5d95ee3 4960 4960 Virtual Archive of Polish Armenians roar roar_4960
4615 dedup::1408358fe6a7f9327dd41a5651ac284c 13824 13824 Digital Commons @ New Jersey Institute of Tech... roar roar_13824
4616 dedup::5cc33dfe7e069a757ca0fcbe6b95c89e opendoar____::d8a4e572d866aa45da78418d9d2ff9f9 4351 odu digital commons OpenDOAR OpenDOAR_4351

3890 rows × 6 columns

Double check partitions

In [20]:
dup.count()
Out[20]:
dedup_id        4617
duplicate_id    4617
original_id     4617
name            4617
source          4617
unique_id       4617
dtype: int64
In [21]:
dup_across.count() + dup_within.count() + dup_hybrid.count()
Out[21]:
dedup_id        4617
duplicate_id    4617
original_id     4617
name            4617
source          4617
unique_id       4617
dtype: int64
In [22]:
dup_within.groupby('dedup_id').ngroups + dup_across.groupby('dedup_id').ngroups + dup_hybrid.groupby('dedup_id').ngroups
Out[22]:
2191
In [23]:
dup.groupby('dedup_id').ngroups
Out[23]:
2191

Joining information

In [24]:
dup_within = dup_within.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_within = dup_within.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_within = dup_within.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_within = dup_within.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_within.head()
Out[24]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects_phrases OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
0 dedup::d2ddea18f00665ce8623e36bd4e3c7c5 8237 8237 AIR | Archivio Istituzionale della Ricerca roar roar_8237 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8237 17 archive 5268 NaN NaN disk0/00/00/82/37 2014-05-15 11:23:30 2014-05-19 05:42:47 2014-05-15 11:23:30 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://air.unimi.it AIR | Archivio Istituzionale della Ricerca http://air.unimi.it/dspace-oai/request NaN NaN NaN AIR (Archivio Istituzionale della ricerca) is ... FALSE FALSE TRUE Università degli Studi di Milano http://www.unimi.it it Milan 45.46 9.1947 dspace geoname_2_IT other NaN 2014-05-04 17:40:53 NaN NaN 0 0 0 99 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,6... NaN NaN NaN NaN celestial 1596 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_8237
1 dedup::4c5bcfec8584af0d967f1ab10179ca4b 2820 2820 USU Repository: Open Access Repository roar roar_2820 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2820 525 archive 65 NaN NaN disk0/00/00/28/20 2010-07-29 01:40:27 2012-01-19 11:37:49 2010-07-29 01:40:27 institutional 2372 NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://repository.usu.ac.id USU Repository: Open Access Repository http://repository.usu.ac.id/oai/request NaN http://repository.usu.ac.id/feed/rss_2.0/site NaN Comprises of works by and/or about the univers... TRUE TRUE FALSE [USU Library, University of Sumatera Utara] [http://library.usu.ac.id, http://www.usu.ac.id] id Medan 3.5595 98.6572 dspace geoname_2_ID other NaN 2010-01-15 10:09:25 NaN NaN 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,52,... NaN NaN NaN NaN [roarmap, opendoar, celestial] [283, 1717, 2101] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_2820
2 dedup::c2ae5cb2426d96ed19a50b0b7d7c8e11 9487 9487 IR at NRF: Home roar roar_9487 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 9487 16 archive 6458 NaN NaN disk0/00/00/94/87 2015-05-15 14:03:55 2016-03-21 20:21:02 2015-05-15 14:03:55 multi NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://ir.nrf.ac.za/ IR at NRF: Home NaN NaN NaN NaN The NRF receives its mandate from the National... TRUE TRUE FALSE National Research Foundation of South Africa http://www.nrf.ac.za/ za Pretoria NaN NaN dspace geoname_2_ZA other [B1, AS, AI] 2015-02-10 06:35:50 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roarmap NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_9487
3 dedup::1c65cef3dfd1e00c0b03923a1c591db4 1241 1241 Swansea Metropolitan University Repository roar roar_1241 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1241 583 archive 1 NaN NaN disk0/00/00/12/41 2010-01-06 13:45:32 2011-07-18 05:57:23 2010-01-06 13:45:32 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://dspace.smu.ac.uk/dspace/ Swansea Metropolitan University Repository http://dspace.smu.ac.uk/dspace-oai/request NaN NaN NaN Users may set up RSS feeds to be alerted to ne... NaN NaN NaN Swansea Metropolitan University http://www.smu.ac.uk/ gb Swansea 51.6144 -3.8727 dspace geoname_2_GB other NaN 2008-05-15 11:29:17 NaN NaN 0 0 0 135 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,135,13... 0 0 0 0 [opendoar, celestial] [1779, 1627] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_1241
4 dedup::4217ec5d78c4bc4e5bd006783482441f 15142 15142 Repositorio Institucional roar roar_15142 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 15142 11 archive 12132 NaN NaN disk0/00/01/51/42 2020-08-08 12:35:50 2021-01-25 22:45:10 2020-08-08 12:35:50 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://repositorio.undar.edu.pe/ Repositorio Institucional http://repositorio.undar.edu.pe/ NaN NaN NaN NaN FALSE FALSE FALSE NaN NaN pe huanuco -9.9269 -76.2396 dspace geoname_2_PE other NaN 2019-09-02 21:20:31 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN opendoar http://v2.sherpa.ac.uk/id/repository/4422 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_15142
In [25]:
dup_hybrid = dup_hybrid.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_hybrid.head()
Out[25]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects_phrases OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
0 dedup::471c50ad1a156d7256eddfd747d77931 opendoar____::6351bf9dce654515bf1ddbd6426dfa97 1996 ehtc repositorio institucional OpenDOAR OpenDOAR_1996 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1996 {"name": "ehtc repositorio institucional", "la... [] http://www.repositorio.ehtc.cu/jspui/ this site provides access to the hospitality a... institutional ["es"] 2019-10-17 14:34:31 2010-12-01 11:11:57 [business and economics, education] [journal_articles, conference_and_workshop_pap... [{'name': 'escuela de hotelería y turismo de c... [] {"name": "dspace", "version": "1.6.2"} NaN yes OpenDOAR_1996 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 dedup::69dafe8b58066478aea48f3d0f384820 2312 2312 Göteborgs universitets publikationer - e-publi... roar roar_2312 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2312 736 archive 1 NaN NaN disk0/00/00/23/12 2010-01-14 12:10:06 2011-07-18 06:01:08 2010-01-14 12:10:06 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://gupea.ub.gu.se/dspace/index.jsp Göteborgs universitets publikationer - e-publi... http://gupea.ub.gu.se/dspace-oai/request NaN NaN NaN This is an institutional repository providing ... FALSE FALSE FALSE Göteborgs Universitet http://www.gu.se/ se NaN 57.6975 11.9608 dspace NaN other NaN 2005-06-07 12:57:08 NaN NaN 0 0 0 96 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... NaN NaN NaN NaN [opendoar, celestial] [1832, 1149] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_2312
2 dedup::8f822ac814829da24a7065b8131bdf47 opendoar____::a34bacf839b923770b2c360eefa26748 1035 kitami institute of technology repository OpenDOAR OpenDOAR_1035 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1035 {"name": "kitami institute of technology repos... [{'name': '北見工業大学学術機関リポジトリ kit-r', 'language':... https://kitami-it.repo.nii.ac.jp/ this site is a university repository providing... institutional ["ja", "en"] 2020-09-09 11:57:56 2007-10-09 09:09:40 [technology general] [journal_articles, unpub_reports_and_working_p... [{'name': 'kitami institute of technology', 'a... [] {"name": "weko", "version": ""} http://kitami-it.repo.nii.ac.jp/oai yes OpenDOAR_1035 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 dedup::63a99723ebb3af94d52b474c3b21dbe1 5779 5779 Sanok Digital Library roar roar_5779 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5779 9 archive 8 NaN NaN disk0/00/00/57/79 2012-12-12 04:54:20 2012-12-15 02:36:20 2012-12-12 04:54:20 other NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://sanockabibliotekacyfrowa.pl/dlibra Sanok Digital Library http://sanockabibliotekacyfrowa.pl/dlibra/oai-... NaN NaN NaN This site provides access to the digitised col... NaN NaN NaN Digital-Center http://www.digital-center.pl/ pl NaN 52.4872 16.8493 NaN geoname_2_PL other NaN 2012-08-05 15:12:12 NaN NaN 0 0 0 19 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,19,19... NaN NaN NaN NaN [opendoar, celestial] [2545, 5072] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_5779
4 dedup::82680bfec0fa08346c1b10d30a3e3d4a 11212 11212 Publication Server of the Wuppertal Institute roar roar_11212 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11212 12 archive 5611 NaN NaN disk0/00/01/12/12 2016-05-04 11:37:14 2016-05-07 01:37:18 2016-05-04 11:37:14 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN https://epub.wupperinst.org/home Publication Server of the Wuppertal Institute\... https://epub.wupperinst.org/oai NaN https://epub.wupperinst.org/rss NaN \n\nOn this Publication Server of the Wupperta... TRUE TRUE FALSE Wuppertal Institut für Klima, Umwelt, Energie http://wupperinst.org/ de Wuppertal 51.2562 7.1508 opus geoname_2_DE other [HB, GE, T1] 2016-04-28 13:58:38 NaN please delete ID 5891 NaN NaN NaN NaN NaN NaN NaN NaN NaN [opendoar, celestial] [2539, 6112] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_11212
In [26]:
dup_across = dup_across.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_across = dup_across.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_across = dup_across.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_across = dup_across.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_across.head()
Out[26]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects_phrases OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
0 dedup::860320be12a1c050cd7731794e231bd3 opendoar____::2290a7385ed77cc5592dc2153229f082 1064 oxford university research archive OpenDOAR OpenDOAR_1064 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1064 {"name": "oxford university research archive",... [{'acronym': 'ora'}] http://ora.ox.ac.uk this site provides access to the collected res... institutional ["zh", "nl", "en", "fr", "de", "it", "ja", "pt... 2021-09-13 13:35:44 2007-10-10 16:16:02 [multidisciplinary] [journal_articles, conference_and_workshop_pap... [{'name': 'university of oxford', 'alternative... [{"policy_url": "https://libguides.bodleian.ox... {"name": "fedora", "version": "4.6.2"} https://ora.ox.ac.uk/oai2 yes OpenDOAR_1064 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 dedup::1aa7a8773e6a7fdacbcedf9999009a38 opendoar____::191f8f858acda435ae0daf994e2a72c2 8648 digital commons@georgia southern OpenDOAR OpenDOAR_8648 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8648 {"name": "digital commons@georgia southern", "... [] https://digitalcommons.georgiasouthern.edu this site provides access to the research outp... institutional ["en"] 2021-02-18 18:13:34 2019-09-28 04:24:47 [multidisciplinary] [journal_articles, conference_and_workshop_pap... [{'name': 'georgia southern university', 'alte... [] {"name": "digital_commons", "version": ""} https://digitalcommons.georgiasouthern.edu/do/oai yes OpenDOAR_8648 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 dedup::31bceb0c3e2a260593e1e36655ebcee4 opendoar____::d5776aeecb3c45ab15adce6f5cb355f3 9713 materials data repository OpenDOAR OpenDOAR_9713 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 9713 {"name": "materials data repository", "languag... [{'acronym': 'mdr'}] https://mdr.nims.go.jp mdr : materials data repository is a data repo... institutional ["en", "ja"] 2021-05-21 18:04:32 2020-07-13 10:09:55 [science general] [journal_articles, conference_and_workshop_pap... [{'name': 'national institute for materials sc... [] {"name": "fedora", "version": ""} https://mdr.nims.go.jp/catalog/oai yes OpenDOAR_9713 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 dedup::e37b08dd3015330dcbb5d6663667b8b8 opendoar____::18997733ec258a9fcaf239cc55d53363 427 digital repository at the university of maryland OpenDOAR OpenDOAR_427 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 427 {"name": "digital repository at the university... [{'acronym': 'drum'}] http://drum.lib.umd.edu/ this site is a university repository providing... institutional ["en"] 2021-09-13 13:35:39 2006-08-04 09:09:20 [multidisciplinary] [journal_articles, theses_and_dissertations, u... [{'name': 'university of maryland', 'alternati... [{"policy_url": "http://drum.lib.umd.edu/page/... {"name": "dspace", "version": "4.1.0"} http://drum.lib.umd.edu/oai/request yes OpenDOAR_427 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 dedup::2841194266115ac1cc04d19630cde46b re3data_____::3afbb2b45a3dd218a5a091ca773cf6c5 r3d100011189 PRISM: University of Calgary's Digital Repository re3data re3data_r3d100011189 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN r3d100011189 PRISM: University of Calgary's Digital Repository eng [] https://prism.ucalgary.ca/ [OpenDOAR:7771] ["digitize@ucalgary.ca", "kmeranji@ucalgary.ca"] PRISM is a digital archive of the University o... eng [institutional] {"size": "", "updatedp": ""} NaN NaN ["eng"] [1 Humanities and Social Sciences, 11 Humaniti... NaN [Audiovisual data, Images, Standard office doc... [dataProvider] [multidisciplinary] [{'institutionName': 'University of Calgary, L... [{"policyName": "Open Access Mandate", "policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Submission Policy"... ["DSpace"] NaN [] ["DOI", "hdl"] NaN [] no yes [] [] {"syndication": "http://prism.ucalgary.ca/feed... NaN 2014-10-20 2020-01-09 re3data_r3d100011189 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [27]:
dup_within = dup_within.groupby('dedup_id').aggregate(list).reset_index()
dup_within['source_set'] = dup_within.source.map(set)
dup_within.head()
<ipython-input-27-3881fa0a0224>:1: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider using pd.concat instead.  To get a de-fragmented frame, use `newframe = frame.copy()`

<ipython-input-27-3881fa0a0224>:2: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider using pd.concat instead.  To get a de-fragmented frame, use `newframe = frame.copy()`

Out[27]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects_phrases OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id source_set
0 dedup::000871c1fc726f0b52dc86a4eeb027de [4612, 4649] [4612, 4649] [IIT Bombay Institutional Repository, IIT Bomb... [roar, roar] [roar_4612, roar_4649] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [4612, 4649] [28, 8] [archive, archive] [1380, 1380] [nan, nan] [nan, nan] [disk0/00/00/46/12, disk0/00/00/46/49] [2012-01-08 03:17:02, 2012-02-05 13:57:01] [2012-04-16 10:53:04, 2012-04-16 10:39:58] [2012-01-08 03:17:02, 2012-02-05 13:57:01] [institutional, institutional] [nan, nan] [nan, nan] [show, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [http://dspace.library.iitb.ac.in/jspui/, http... [IIT Bombay Institutional Repository, IIT Bomb... [http://dspace.library.iitb.ac.in/oai/request,... [nan, nan] [http://dspace.library.iitb.ac.in/xmlui/feed/a... [nan, nan] [nan, nan] [TRUE, TRUE] [TRUE, TRUE] [TRUE, FALSE] [IIT Bombay, IIT Bombay] [http://www.iitb.ac.in, http://www.iitb.ac.in] [in, in] [Mumbai, Mumbai] [19.133, 19.133] [72.9166, 72.9166] [dspace, dspace] [geoname_2_IN, geoname_2_IN] [other, other] TP, TN, TJ, TH, TK, TD, TA], [TA, T1 [2011-12-15 09:01:35, 2012-01-05 12:09:37] [nan, nan] [nan, nan] [0, nan] [0, nan] [0, nan] [99, nan] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,95,... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [celestial, celestial] [4790, 4789] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roar_4612, roar_4649] {roar}
1 dedup::0163cceb20f5ca7b313419c068abd9dc [7943, 8003] [7943, 8003] [EPrints@NIRT Library Welcomes! - EPrints@NITR... [roar, roar] [roar_7943, roar_8003] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [7943, 8003] [16, 19] [archive, archive] [4963, 5023] [nan, nan] [nan, nan] [disk0/00/00/79/43, disk0/00/00/80/03] [2014-03-11 11:54:06, 2014-03-30 18:13:01] [2014-05-08 13:07:12, 2014-05-08 12:55:41] [2014-03-11 11:54:06, 2014-03-30 18:13:01] [institutional, institutional] [nan, nan] [nan, nan] [show, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [http://eprints.nirt.res.in/, http://eprints.n... [EPrints@NIRT Library Welcomes! - EPrints@NITR... [http://eprints.nirt.res.in/cgi/oai2, http://e... [nan, nan] [http://eprints.nirt.res.in/cgi/latest_tool?ou... [nan, nan] [This is the Institutional Repository of the N... [TRUE, FALSE] [TRUE, FALSE] [FALSE, FALSE] [National Institute for Research in Tuberculos... [http://www.nirt.res.in/, http://www.nirt.res.in] [in, in] [Chennai, Chennai (Madras)] [nan, 13] [nan, 80] [eprints, eprints] [geoname_2_IN, geoname_2_IN] [3.3.15 eps, 3.3.15 eps] RB, RM], [R1, RZ [2014-03-07 15:07:45, 2014-03-19 07:05:04] [The National Institute for Research in Tuberc... [nan, Please include "Tuberculosis" as a Speci... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [[opendoar, celestial], celestial] [[5410, 2725], 5430] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roar_7943, roar_8003] {roar}
2 dedup::028ee724157b05d04e7bdcf237d12e60 [2670, 2698, 2741] [2670, 2698, 2741] [HSF Brage Open Research Archive, HSF Brage Op... [roar, roar, roar] [roar_2670, roar_2698, roar_2741] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [2670, 2698, 2741] [470, 317, 231] [archive, archive, archive] [235, 8, 8] [nan, nan, nan] [nan, nan, nan] [disk0/00/00/26/70, disk0/00/00/26/98, disk0/0... [2010-05-04 02:19:51, 2010-05-13 11:01:53, 201... [2011-07-18 06:02:42, 2011-07-06 08:24:10, 201... [2010-05-04 02:19:51, 2010-05-13 11:01:53, 201... [institutional, institutional, institutional] [nan, nan, nan] [nan, nan, nan] [show, show, show] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [0, 0, 0] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [http://brage.bibsys.no/hsf/, http://brage.bib... [HSF Brage Open Research Archive, HSF Brage Op... [http://oai.bibsys.no/oai/repository/nora_hsf_... [http://brage.bibsys.no/hsf/?locale=en, nan, nan] [nan, nan, nan] [nan, nan, nan] [This site provides access to the research out... [TRUE, FALSE, FALSE] [TRUE, FALSE, FALSE] [FALSE, FALSE, FALSE] [Sogn og Fjordane University College, Høgskule... [http://www.hisf.no/, http://www.hisf.no/, htt... [no, no, no] [Sogndal, nan, nan] [61.2174, 61.2174, 60.3904] [7.1082, 7.1082, 5.3332] [dspace, dspace, dspace] [geoname_2_NO, nan, nan] [other, other, other] [nan, nan, nan] [2010-04-06 13:51:52, 2010-05-09 15:12:16, 201... [nan, nan, nan] [nan, nan, nan] [0, nan, nan] [0, nan, nan] [0, nan, nan] [50, nan, nan] [0,0,1,1,1,4,4,6,6,7,8,11,12,14,15,17,18,18,18... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [[opendoar, celestial], opendoar, opendoar] [[2426, 1781], 1781, 1807] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [roar_2670, roar_2698, roar_2741] {roar}
3 dedup::03593ce517feac573fdaafa6dcedef61 [4393, 4394] [4393, 4394] [Institutional Repository of Kunming Institute... [roar, roar] [roar_4393, roar_4394] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [4393, 4394] [14, 14] [archive, archive] [986, 986] [nan, nan] [nan, nan] [disk0/00/00/43/93, disk0/00/00/43/94] [2011-11-09 23:14:52, 2011-11-09 23:14:46] [2012-02-06 06:58:40, 2012-02-06 06:58:41] [2011-11-09 23:14:52, 2011-11-09 23:14:46] [institutional, institutional] [nan, nan] [nan, nan] [show, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [http://159.226.149.42:8088/, http://159.226.1... [Institutional Repository of Kunming Institute... [http://159.226.149.42:8088/casirgrid-oai/requ... [nan, nan] [nan, nan] [nan, nan] [This site provides access to the output of th... [TRUE, TRUE] [TRUE, TRUE] [FALSE, FALSE] [ Kunming Institute of Zoology Chinese Academy... [http://www.kiz.ac.cn/, http://www.kiz.ac.cn/] [cn, cn] [kunming, kunming] [25.0416, 25.0416] [102.755, 102.755] [dspace, dspace] [geoname_2_CN, geoname_2_CN] [other, other] [nan, nan] [2010-07-22 16:00:13, 2010-07-22 16:00:13] [nan, nan] [nan, nan] [0, 0] [0, 0] [0, 0] [100, 100] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [celestial, celestial] [4715, 4715] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roar_4393, roar_4394] {roar}
4 dedup::03e0704b5690a2dee1861dc3ad3316c9 [1019, 5550] [1019, 5550] [PolyU Institutional Repository, PolyU Institu... [roar, roar] [roar_1019, roar_5550] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [1019, 5550] [526, 9] [archive, archive] [1, 8] [nan, nan] [nan, nan] [disk0/00/00/10/19, disk0/00/00/55/50] [2010-01-06 13:45:03, 2012-12-12 01:25:48] [2012-01-19 11:35:09, 2012-12-17 06:53:14] [2010-01-06 13:45:03, 2012-12-12 01:25:48] [institutional, institutional] [nan, nan] [nan, nan] [show, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [0, nan] [nan, nan] [nan, nan] [nan, nan] [http://repository.lib.polyu.edu.hk/, http://r... [PolyU Institutional Repository, PolyU Institu... [http://repository.lib.polyu.edu.hk/oai/reques... [nan, nan] [nan, nan] [nan, nan] [nan, This is an Institutional repository prov... [TRUE, nan] [TRUE, nan] [nan, nan] [The Hong Kong Polytechnic University Pao Yue-... [http://www.lib.polyu.edu.hk, http://www.polyu... [hk, cn] [Hong Kong, nan] [22.25, 22.3964] [114.167, 114.109] [dspace, dspace] [geoname_2_HK, geoname_2_CN] [other, other] [nan, nan] [2008-10-30 07:50:38, 2012-07-01 15:13:40] [nan, nan] [nan, nan] [0, 0] [0, 0] [0, 0] [86, 86] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,54,71,80,... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [[roarmap, opendoar, celestial], [opendoar, ce... 193, 1456, 1441], [1456, 1441 [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roar_1019, roar_5550] {roar}
In [28]:
dup_hybrid = dup_hybrid.groupby('dedup_id').aggregate(list).reset_index()
dup_hybrid['source_set'] = dup_hybrid.source.map(set)
dup_hybrid.head()
<ipython-input-28-89649d18870f>:1: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider using pd.concat instead.  To get a de-fragmented frame, use `newframe = frame.copy()`

<ipython-input-28-89649d18870f>:2: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider using pd.concat instead.  To get a de-fragmented frame, use `newframe = frame.copy()`

Out[28]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects_phrases OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id source_set
0 dedup::01b6397888c09d84f3dc89d807aa1004 [4745, opendoar____::a9365bd906e11324065c35be4... [4745, 2429, 4320] [RU-Económicas, ru-económicas, ru económicas] [roar, OpenDOAR, OpenDOAR] [roar_4745, OpenDOAR_2429, OpenDOAR_4320] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, 2429, 4320] [nan, {"name": "ru-econ\u00f3micas", "language... [nan, [], []] [nan, http://ru.iiec.unam.mx/, http://ru.iiec.... [nan, this site provides access to the researc... [nan, institutional, institutional] [nan, ["es"], ["es"]] [nan, 2021-09-13 13:35:56, 2021-09-13 13:36:17] [nan, 2012-02-28 12:12:09, 2019-02-19 10:51:49] [nan, [multidisciplinary], [business and econo... [nan, [journal_articles, theses_and_dissertati... [nan, [{'name': 'universidad nacional autónoma... [nan, [{"policy_url": "http://ru.iiec.unam.mx/... [nan, {"name": "eprints", "version": "3.3.15"}... [nan, http://ru.iiec.unam.mx/cgi/oai2, nan] [nan, yes, yes] [nan, OpenDOAR_2429, OpenDOAR_4320] [4745, nan, nan] [31, nan, nan] [archive, nan, nan] [1447, nan, nan] [nan, nan, nan] [nan, nan, nan] [disk0/00/00/47/45, nan, nan] [2012-02-05 14:27:15, nan, nan] [2012-04-16 10:34:36, nan, nan] [2012-02-05 14:27:15, nan, nan] [institutional, nan, nan] [nan, nan, nan] [nan, nan, nan] [show, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [http://ru.iiec.unam.mx, nan, nan] [RU-Económicas, nan, nan] [http://ru.iiec.unam.mx/cgi/oai2, nan, nan] [nan, nan, nan] [http://ru.iiec.unam.mx/cgi/latest_tool?output... [nan, nan, nan] [Productos académicos del Instituto de Investi... [TRUE, nan, nan] [TRUE, nan, nan] [TRUE, nan, nan] [Instituto de Investigaciones Económicas UNAM,... [http://www.iiec.unam.mx/, nan, nan] [mx, nan, nan] [Mexico, nan, nan] [19.3162, nan, nan] [-99.1799, nan, nan] [eprints, nan, nan] [geoname_2_MX, nan, nan] [3.3.15 eps, nan, nan] [[GF, HJ, HT, HB, HM, HC, HX, HN, H1, G1, T1, ... [2012-02-03 05:18:16, nan, nan] [nan, nan, nan] [nan, nan, nan] [0, nan, nan] [0, nan, nan] [0, nan, nan] [94, nan, nan] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,7... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [[opendoar, celestial], nan, nan] [[2429, 4818], nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [roar_4745, nan, nan] {roar, OpenDOAR}
1 dedup::03db60c2331018b18c4166c1787072fe [opendoar____::78bc62d08a9a0b9b0b9c0ad339ef82d... [3087, 4500, 8504] [landmark university repository, landmark univ... [OpenDOAR, OpenDOAR, roar] [OpenDOAR_3087, OpenDOAR_4500, roar_8504] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [3087, 4500, nan] [{"name": "landmark university repository", "l... [[], [], nan] [http://eprints.lmu.edu.ng/, http://eprints.lm... [this site provides access to the multi-discip... [institutional, institutional, nan] [["en"], ["en"], nan] [2021-09-13 13:36:06, 2021-02-18 18:01:12, nan] [2014-06-16 13:36:00, 2019-03-26 14:07:30, nan] [[multidisciplinary], [multidisciplinary], nan] [[journal_articles], [journal_articles, biblio... [[{'name': 'landmark university', 'alternative... [[{"policy_url": "http://eprints.lmu.edu.ng/po... [{"name": "eprints", "version": "3.3.12"}, {"n... [http://eprints.lmu.edu.ng/cgi/oai2, nan, nan] [yes, yes, nan] [OpenDOAR_3087, OpenDOAR_4500, nan] [nan, nan, 8504] [nan, nan, 12] [nan, nan, archive] [nan, nan, 5459] [nan, nan, nan] [nan, nan, nan] [nan, nan, disk0/00/00/85/04] [nan, nan, 2014-06-24 10:14:07] [nan, nan, 2014-06-28 01:38:49] [nan, nan, 2014-06-24 10:14:07] [nan, nan, institutional] [nan, nan, nan] [nan, nan, nan] [nan, nan, show] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, http://eprints.lmu.edu.ng] [nan, nan, Landmark University Repository] [nan, nan, http://eprints.lmu.edu.ng/cgi/oai] [nan, nan, nan] [nan, nan, http://eprints.lmu.edu.ng/cgi/lates... [nan, nan, nan] [nan, nan, nan] [nan, nan, TRUE] [nan, nan, TRUE] [nan, nan, TRUE] [nan, nan, Landmark University] [nan, nan, http://lmu.edu.ng] [nan, nan, ng] [nan, nan, Omu-Aran] [nan, nan, 8.12421] [nan, nan, 5.09488] [nan, nan, eprints] [nan, nan, geoname_2_NG] [nan, nan, 3.3.16 eps] [nan, nan, nan] [nan, nan, 2014-06-07 22:16:23] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, [opendoar, celestial]] [nan, nan, [5621, 3087]] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, roar_8504] {roar, OpenDOAR}
2 dedup::05128e44e27c36bdba71221bfccf735d [opendoar____::426f990b332ef8193a61cc90516c124... [2318, 5503, 4271] [iława biblioteka cyrfrowa (iława digital libr... [OpenDOAR, roar, roar] [OpenDOAR_2318, roar_5503, roar_4271] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [2318, nan, nan] [{"name": "i\u0142awa biblioteka cyrfrowa (i\u... [[], nan, nan] [http://ibc.ilawa.pl/dlibra, nan, nan] [this site provides access to digitised articl... [governmental, nan, nan] [["pl"], nan, nan] [2019-10-17 14:34:36, nan, nan] [2011-10-11 13:13:58, nan, nan] [[multidisciplinary], nan, nan] [[journal_articles], nan, nan] [[{'name': 'iława', 'alternativeName': '', 'co... [[], nan, nan] [{"name": "dlibra", "version": "4"}, nan, nan] [http://ibc.ilawa.pl/dlibra/oai-pmh-repository... [yes, nan, nan] [OpenDOAR_2318, nan, nan] [nan, 5503, 4271] [nan, 9, 11] [nan, archive, archive] [nan, 8, 8] [nan, nan, nan] [nan, nan, nan] [nan, disk0/00/00/55/03, disk0/00/00/42/71] [nan, 2012-11-19 20:33:30, 2011-10-27 01:25:14] [nan, 2012-11-26 06:53:42, 2011-12-19 07:07:23] [nan, 2012-11-19 20:33:30, 2011-10-27 01:25:14] [nan, other, other] [nan, nan, nan] [nan, nan, nan] [nan, show, show] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, http://ibc.ilawa.pl/dlibra, http://ibc.i... [nan, Iława Biblioteka Cyrfrowa (Iława Digital... [nan, http://ibc.ilawa.pl/dlibra/oai-pmh-repos... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, This site provides access to digitised a... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, Iława, Iława] [nan, http://www.ilawa.pl/_portal, http://www.... [nan, pl, pl] [nan, nan, nan] [nan, 53.596, 53.596] [nan, 19.5684, 19.5684] [nan, nan, nan] [nan, geoname_2_PL, geoname_2_PL] [nan, other, other] [nan, nan, nan] [nan, 2012-07-01 15:13:09, 2009-10-12 10:46:08] [nan, nan, nan] [nan, nan, nan] [nan, 0, 0] [nan, 0, 0] [nan, 0, 0] [nan, 20, 20] [nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, [opendoar, celestial], [opendoar, celest... [nan, [2318, 4672], [2318, 4672]] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, roar_5503, roar_4271] {roar, OpenDOAR}
3 dedup::069059b7ef840f0c74a814ec9237b6ec [5711, 126, opendoar____::1cd3882394520876dc88... [5711, 126, 1509] [Bibioteca Digital Ação Educativa, Biblioteca ... [roar, roar, OpenDOAR] [roar_5711, roar_126, OpenDOAR_1509] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, 1509] [nan, nan, {"name": "bibioteca digital a\u00e7... [nan, nan, []] [nan, nan, http://www.bdae.org.br/dspace/] [nan, nan, this site provides access to the ou... [nan, nan, institutional] [nan, nan, ["pt"]] [nan, nan, 2019-10-17 14:34:23] [nan, nan, 2009-05-01 10:10:47] [nan, nan, [education]] [nan, nan, [theses_and_dissertations, unpub_re... [nan, nan, [{'name': 'ação educativa', 'altern... [nan, nan, []] [nan, nan, {"name": "dspace", "version": ""}] [nan, nan, http://www.bdae.org.br/dspace-oai/r... [nan, nan, yes] [nan, nan, OpenDOAR_1509] [5711, 126, nan] [9, 503, nan] [archive, archive, nan] [8, 1, nan] [nan, nan, nan] [nan, nan, nan] [disk0/00/00/57/11, disk0/00/00/01/26, nan] [2012-12-12 04:37:14, 2010-01-06 13:43:56, nan] [2012-12-17 06:53:38, 2011-07-18 05:42:07, nan] [2012-12-12 04:37:14, 2010-01-06 13:43:56, nan] [institutional, other, nan] [nan, nan, nan] [nan, nan, nan] [show, show, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, 0, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [http://www.bdae.org.br/dspace/, http://www.bd... [Bibioteca Digital Ação Educativa, Biblioteca ... [http://www.bdae.org.br/dspace-oai/request, ht... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [This site provides access to the output of th... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [Ação Educativa, Ação Educativa, nan] [http://www.acaoeducativa.org/, http://www.aca... [br, br, nan] [nan, São Paulo, nan] [-23.5445, -23.5445, nan] [-46.6509, -46.6509, nan] [dspace, dspace, nan] [geoname_2_BR, geoname_2_BR, nan] [other, other, nan] [nan, nan, nan] [2012-07-22 15:12:34, 2008-03-31 20:07:33, nan] [nan, nan, nan] [nan, nan, nan] [0, 0, nan] [0, 0, nan] [0, 0, nan] [100, 100, nan] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,97,100,... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [[opendoar, celestial], [opendoar, celestial],... [[1430, 1509], [1430, 1509], nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [roar_5711, roar_126, nan] {roar, OpenDOAR}
4 dedup::0e139b17a92b2df7d6c3c840e51465fe [4379, 4266, opendoar____::f976b57bb9dd27aa2e7... [4379, 4266, 2306] [Institutional Repository of Ningbo Institute ... [roar, roar, OpenDOAR] [roar_4379, roar_4266, OpenDOAR_2306] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, 2306] [nan, nan, {"name": "institutional repository ... [nan, nan, [{'acronym': 'nimte openir'}]] [nan, nan, http://ir.nimte.ac.cn/] [nan, nan, this site provides access to the ou... [nan, nan, institutional] [nan, nan, ["zh", "en"]] [nan, nan, 2019-10-17 14:34:36] [nan, nan, 2011-10-10 13:13:11] [nan, nan, [technology general, mechanical eng... [nan, nan, [journal_articles, bibliographic_re... [nan, nan, [{'name': 'chinese academy of scien... [nan, nan, []] [nan, nan, {"name": "dspace", "version": ""}] [nan, nan, http://ir.nimte.ac.cn/casirgrid-oai... [nan, nan, yes] [nan, nan, OpenDOAR_2306] [4379, 4266, nan] [15, 11, nan] [archive, archive, nan] [986, 8, nan] [nan, nan, nan] [nan, nan, nan] [disk0/00/00/43/79, disk0/00/00/42/66, nan] [2011-11-09 23:16:22, 2011-10-27 01:26:05, nan] [2011-12-21 15:25:04, 2011-12-19 07:07:21, nan] [2011-11-09 23:16:22, 2011-10-27 01:26:05, nan] [institutional, institutional, nan] [nan, nan, nan] [nan, nan, nan] [show, show, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [http://ir.nimte.ac.cn/, http://ir.nimte.ac.cn... [Institutional Repository of Ningbo Institute ... [http://ir.nimte.ac.cn/casirgrid-oai/request, ... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [This site provides access to the output of th... [TRUE, nan, nan] [TRUE, nan, nan] [FALSE, nan, nan] [Ningbo Institute of Material Technology & Eng... [http://www.nimte.ac.cn/, http://www.cas.cn/, ... [cn, cn, nan] [ningbo, nan, nan] [29.8807, 29.8807, nan] [121.672, 121.672, nan] [dspace, dspace, nan] [geoname_2_CN, geoname_2_CN, nan] [other, other, nan] [nan, nan, nan] [2009-12-21 02:27:07, 2009-12-21 02:27:07, nan] [nan, nan, nan] [nan, nan, nan] [0, 0, nan] [0, 0, nan] [0, 0, nan] [100, 100, nan] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [celestial, [opendoar, celestial], nan] [4668, [4668, 2306], nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [roar_4379, roar_4266, nan] {roar, OpenDOAR}
In [29]:
dup_across = dup_across.groupby('dedup_id').aggregate(list).reset_index()
dup_across['source_set'] = dup_across.source.map(set)
dup_across.head()
<ipython-input-29-7abf9225ca42>:1: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider using pd.concat instead.  To get a de-fragmented frame, use `newframe = frame.copy()`

<ipython-input-29-7abf9225ca42>:2: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider using pd.concat instead.  To get a de-fragmented frame, use `newframe = frame.copy()`

Out[29]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects_phrases OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id source_set
0 dedup::001e6d882e54c780ce269d3c46997287 [re3data_____::4af9fe2bb93511a5e0f0c39e94d6557... [r3d100011306, 2094] [RESID Database of Protein Modifications, RESI... [re3data, FAIRsharing] [re3data_r3d100011306, FAIRsharing_2094] [nan, 2094] [nan, fairsharing-records] [nan, 2014-11-04T15:23:40.000Z] [nan, 2021-09-30T11:38:37.114Z] [nan, 10.25504/FAIRsharing.qaszjp] [nan, RESID Database of Protein Modifications] [nan, ready] [nan, [{'contact-name': 'John S Garavelli', 'c... [nan, http://pir.georgetown.edu/resid/] [nan, 2094.0] [nan, The RESID Database of Protein Modificati... [nan, [{'url': 'http://pir.georgetown.edu/resi... [nan, nan] [nan, [{'url': 'ftp://ftp.pir.georgetown.edu/p... [nan, [biodbcore-000563, bsg-d000563]] [nan, Database] [nan, knowledgebase] [nan, [Life Science]] [nan, [Molecular structure, Small molecule, St... [nan, [All]] [nan, []] [nan, [United Kingdom, European Union, Switzer... [nan, FAIRsharing record for: RESID Database o... [nan, RESID] [nan, https://fairsharing.org/10.25504/FAIRsha... [nan, 10.25504/FAIRsharing.qaszjp] [nan, https://creativecommons.org/licenses/by-... [nan, This FAIRsharing record describes: The R... [nan, [{'id': 334, 'pubmed_id': 12520062, 'tit... [nan, [{'licence-name': 'Open Data Commons (OD... [nan, nan] [nan, RESID] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, FAIRsharing_2094] [r3d100011306, nan] [RESID Database of Protein Modifications, nan] [eng, nan] [[], nan] [https://pir.georgetown.edu/resid/resid.shtml,... [[FAIRsharing_doi:10.25504/FAIRsharing.qaszjp,... [["pirmail@georgetown.edu"], nan] [The RESID Database of Protein Modifications i... [eng, nan] [[disciplinary], nan] [{"size": "", "updatedp": ""}, nan] [2014, nan] [nan, nan] [["eng"], nan] [[2 Life Sciences, 201 Basic Biological and Me... [nan, nan] [[Images, Structured text], nan] [[dataProvider], nan] [[genomes, life sciences, proteins, proteomes,... [[{'institutionName': 'Georgetown University, ... [[{"policyName": "Terms of Use", "policyURL": ... [ {"databaseAccessType": "open", "databaseAcce... [[], nan] [[{"dataAccessType": "open", "dataAccessRestri... [[{"dataLicenseName": "Copyrights", "dataLicen... [closed, nan] [[], nan] [["unknown"], nan] [yes, nan] [["ftp://ftp.pir.georgetown.edu/databases/", "... [["none"], nan] [nan, nan] [[], nan] [yes, nan] [unknown, nan] [[], nan] [[], nan] [{}, nan] [RESID is covered by Thomson Reuters Data Cita... [2014-12-05, nan] [2019-01-17, nan] [re3data_r3d100011306, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] {FAIRsharing, re3data}
1 dedup::0023a1e3447fdb31836536cc903f1310 [opendoar____::c6f798b844366ccd65d99bc7f31e0e0... [3410, 10013] [erucu: electronic repository of the ukrainian... [OpenDOAR, roar] [OpenDOAR_3410, roar_10013] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [3410, nan] [{"name": "erucu: electronic repository of the... [[], nan] [http://er.ucu.edu.ua/, nan] [ukrainian catholic universitys institutional... [institutional, nan] [["uk", "en"], nan] [2019-10-17 14:34:57, nan] [2015-07-08 12:43:38, nan] [[multidisciplinary], nan] [[journal_articles, conference_and_workshop_pa... [[{'name': 'ukrainian catholic university', 'a... [[], nan] [{"name": "dspace", "version": ""}, nan] [nan, nan] [yes, nan] [OpenDOAR_3410, nan] [nan, 10013] [nan, 31] [nan, archive] [nan, 7104] [nan, nan] [nan, nan] [nan, disk0/00/01/00/13] [nan, 2015-08-08 14:53:04] [nan, 2016-03-21 19:54:43] [nan, 2015-08-08 14:53:04] [nan, institutional] [nan, nan] [nan, nan] [nan, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, http://er.ucu.edu.ua/] [nan, ErUCU: Electronic repository of the Ukra... [nan, http://er.ucu.edu.ua/oai/request] [nan, http://er.ucu.edu.ua/sword/] [nan, http://er.ucu.edu.ua/feed/rss_2.0/site] [nan, nan] [nan, Ukrainian Catholic Universitys institut... [nan, TRUE] [nan, TRUE] [nan, TRUE] [nan, Ukrainian Catholic University] [nan, http://ucu.edu.ua/eng/] [nan, ua] [nan, Lviv] [nan, nan] [nan, nan] [nan, dspace] [nan, geoname_2_UA] [nan, other] [nan, [H1, L1, AC, D204, B1, D1, DK, BF, BS, H... [nan, 2015-07-07 12:38:37] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, [opendoar, celestial]] [nan, [3410, 5883]] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, [russell_group, ivy_league]] [nan, roar_10013] {roar, OpenDOAR}
2 dedup::003ab6b40af9b488decea7c582d150a2 [https://fairsharing.org/10.25504/FAIRsharing.... [2315, r3d100011894] [Synapse, Synapse] [FAIRsharing, re3data] [FAIRsharing_2315, re3data_r3d100011894] [2315, nan] [fairsharing-records, nan] [2016-08-02T13:56:30.000Z, nan] [2021-09-30T11:38:43.134Z, nan] [10.25504/FAIRsharing.dnxzmk, nan] [Synapse, nan] [ready, nan] [[{'contact-name': 'Meredith Slota', 'contact-... [https://www.synapse.org/, nan] [2315.0, nan] [Synapse is a collaborative research platform ... [[{'url': 'SynapseInfo@sagebase.org', 'name': ... [2010.0, nan] [[{'url': 'https://www.synapse.org/', 'name': ... [[biodbcore-000791, bsg-d000791], nan] [Database, nan] [repository, nan] [[Biomedical Science, Data Management, Data In... [[Experimental measurement, Protocol, Data sto... [[All], nan] [[], nan] [[United States], nan] [FAIRsharing record for: Synapse, nan] [Synapse, nan] [https://fairsharing.org/10.25504/FAIRsharing.... [10.25504/FAIRsharing.dnxzmk, nan] [https://creativecommons.org/licenses/by-sa/4.... [This FAIRsharing record describes: Synapse is... [[{'id': 2450, 'pubmed_id': 24071850, 'title':... [[{'licence-name': 'Creative Commons Attributi... [nan, nan] [Synapse, nan] [[{'url': 'http://rest-docs.synapse.org/rest/'... [[{'url': 'https://sage-bionetworks.github.io/... [nan, nan] [nan, nan] [nan, nan] [FAIRsharing_2315, nan] [nan, r3d100011894] [nan, Synapse] [nan, eng] [nan, []] [nan, https://www.synapse.org] [nan, [RRID:SCR_006307, RRID:nlx_151983]] [nan, ["synapseinfo@sagebase.org"]] [nan, Synapse is an open source software platf... [nan, eng] [nan, [other]] [nan, {"size": "", "updatedp": ""}] [nan, 2012-05-22] [nan, nan] [nan, ["eng"]] [nan, [2 Life Sciences, 201 Basic Biological a... [nan, https://sagebionetworks.org/tools_resour... [nan, [Raw data, Scientific and statistical da... [nan, [dataProvider, serviceProvider]] [nan, [AMP-AD Knowledge Portal, DREAM Challeng... [nan, [{'institutionName': 'Alfred P. Sloan Fo... [nan, [{"policyName": "Synapse Commons Governa... [nan, {"databaseAccessType": "open", "databas... [nan, []] [nan, [{"dataAccessType": "closed", "dataAcces... [nan, [{"dataLicenseName": "other", "dataLicen... [nan, restricted] [nan, []] [nan, ["unknown"]] [nan, yes] [nan, ["https://docs.synapse.org/rest/", "REST"]] [nan, ["DOI"]] [nan, nan] [nan, []] [nan, yes] [nan, yes] [nan, []] [nan, []] [nan, {}] [nan, nan] [nan, 2015-12-03] [nan, 2021-05-17] [nan, re3data_r3d100011894] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] {FAIRsharing, re3data}
3 dedup::0064f599ed0adb5870a5b3ffe438e485 [16034, opendoar____::d1f157379ea7e51d4a8c07af... [16034, 9647] [Giresun University Institutional Repository, ... [roar, OpenDOAR] [roar_16034, OpenDOAR_9647] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, 9647] [nan, {"name": "giresun university institution... [nan, [{'acronym': 'dspace@giresun'}, {'name':... [nan, http://acikerisim.giresun.edu.tr] [nan, this site provides access to the researc... [nan, institutional] [nan, ["tr"]] [nan, 2021-05-21 18:05:06] [nan, 2020-06-02 09:14:18] [nan, [multidisciplinary]] [nan, [journal_articles]] [nan, [{'name': 'giresun university', 'alterna... [nan, []] [nan, {"name": "dspace", "version": "6.2"}] [nan, http://acikerisim.giresun.edu.tr/oai/req... [nan, yes] [nan, OpenDOAR_9647] [16034, nan] [7, nan] [archive, nan] [12932, nan] [nan, nan] [nan, nan] [disk0/00/01/60/34, nan] [2020-06-01 20:13:50, nan] [2020-06-01 20:14:04, nan] [2020-06-01 20:13:50, nan] [institutional, nan] [nan, nan] [nan, nan] [show, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [https://acikerisim.giresun.edu.tr, nan] [Giresun University Institutional Repository, ... [https://acikerisim.giresun.edu.tr/oai, nan] [nan, nan] [nan, nan] [nan, nan] [DSpace@Giresun is a growing collection of Gir... [TRUE, nan] [TRUE, nan] [TRUE, nan] [Giresun University, nan] [https://www.giresun.edu.tr/, nan] [tr, nan] [Giresun, nan] [40.9147, nan] [38.323, nan] [dspace, nan] [geoname_2_TR, nan] [other, nan] [nan, nan] [2020-05-29 18:13:17, nan] [DSpace@Giresun is a growing collection of Gir... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roarmap, nan] [http://roarmap.eprints.org/1046/, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roar_16034, nan] {roar, OpenDOAR}
4 dedup::00ac8ed3b4327bdd4ebbebcb2ba10a00 [610, opendoar____::299fb2142d7de959380f91c01c... [610, 1426] [Hedatuz, hedatuz] [roar, OpenDOAR] [roar_610, OpenDOAR_1426] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, 1426] [nan, {"name": "hedatuz", "language": "en"}] [nan, []] [nan, http://hedatuz.euskomedia.org/] [nan, this site contains works published by eu... [nan, disciplinary] [nan, ["eu", "fr", "es", "en"]] [nan, 2019-10-17 14:34:21] [nan, 2009-02-02 13:13:26] [nan, [multidisciplinary]] [nan, [journal_articles, books_chapters_and_se... [nan, [{'name': 'euskomedia', 'alternativeName... [nan, []] [nan, {"name": "eprints", "version": "3.0.5"}] [nan, http://hedatuz.euskomedia.org/cgi/oai2] [nan, yes] [nan, OpenDOAR_1426] [610, nan] [514, nan] [archive, nan] [1, nan] [nan, nan] [nan, nan] [disk0/00/00/06/10, nan] [2010-01-06 13:44:32, nan] [2011-07-18 05:48:34, nan] [2010-01-06 13:44:32, nan] [institutional, nan] [nan, nan] [nan, nan] [show, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [0, nan] [nan, nan] [nan, nan] [nan, nan] [http://hedatuz.euskomedia.org/, nan] [Hedatuz, nan] [http://hedatuz.euskomedia.org/cgi/oai2, nan] [nan, nan] [http://hedatuz.euskomedia.org/cgi/latest_tool... [nan, nan] [Hedatuz, created by the Euskomedia Fundazioa,... [TRUE, nan] [TRUE, nan] [nan, nan] [Euskomedia Fundazioa, nan] [http://www.euskomedia.org, nan] [org, nan] [ (Unknown city), nan] [nan, nan] [nan, nan] [eprints, nan] [geoname_2_ORG, nan] [eprints-3.0.5, nan] [nan, nan] [2008-10-03 15:36:07, nan] [nan, nan] [nan, nan] [0, nan] [0, nan] [0, nan] [100, nan] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,90,90,91,... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [[opendoar, celestial], nan] [[1294, 1426], nan] [nan, nan] [nan, nan] [nan, nan] [570, nan] [331, nan] [519, nan] [145, nan] [806, nan] [nan, nan] [nan, nan] [nan, nan] [roar_610, nan] {roar, OpenDOAR}
In [30]:
def remove_nan(list_obj):
    if isinstance(list_obj, list):
        while np.nan in list_obj:
            list_obj.remove(np.nan)
    return list_obj
    
dup_within.applymap(remove_nan).to_csv('../data/processed/dup_within.csv')
dup_hybrid.applymap(remove_nan).to_csv('../data/processed/dup_hybrid.csv')
dup_across.applymap(remove_nan).to_csv('../data/processed/dup_across.csv')