You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

792 KiB

In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

Loading data from registries

In [2]:
with open('../data/raw/fairsharing_dump_api_02_2022.json') as f:
    lines = f.read().splitlines()
    
fairsharing_df = pd.DataFrame(lines)
fairsharing_df.columns = ['json_element']
fairsharing_df['json_element'].apply(json.loads)
fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))

fairsharing_df['unique_id'] = 'FAIRsharing_' + fairsharing_df.id
fairsharing_df = fairsharing_df.add_prefix('FAIRsharing_')
fairsharing_df.head()
Out[2]:
FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.metadata.cross-references FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.url-for-logo FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.data-access-condition.type FAIRsharing_attributes.metadata.data-contact-information FAIRsharing_attributes.metadata.data-deposition-condition.url FAIRsharing_attributes.metadata.data-deposition-condition.type FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.data-access-condition.url FAIRsharing_attributes.metadata.resource-sustainability.url FAIRsharing_attributes.metadata.resource-sustainability.name FAIRsharing_attributes.metadata.data-preservation-policy.url FAIRsharing_attributes.metadata.data-preservation-policy.name FAIRsharing_attributes.metadata.data-access-for-pre-publication-review FAIRsharing_attributes.metadata.data-versioning FAIRsharing_attributes.metadata.data-curation.type FAIRsharing_attributes.metadata.data-curation.url FAIRsharing_attributes.metadata.citation-to-related-publications FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id
0 3226 fairsharing-records 2020-12-09T11:53:44.000Z 2022-02-08T10:42:36.452Z 10.25504/FAIRsharing.d6423b WDC Sunspot Index and Long-term Solar Observat... ready [{'contact-name': 'Frédéric Clette', 'contact-... http://sidc.be/silso/home 3226 The WDC-SILSO is an activity of the Operationa... WDC-SILSO [{'url': 'http://www.sidc.be/silso/taxonomy/te... 2013.0 [{'url': 'http://www.sidc.be/silso/datafiles',... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-001740, bsg-d001740] Database repository [Electromagnetism, Astrophysics and Astronomy,... [Climate, Observation design] [Not applicable] [Climate change, earth observation, Electromag... [Belgium] FAIRsharing record for: WDC Sunspot Index and ... WDC-SILSO https://fairsharing.org/10.25504/FAIRsharing.d... 10.25504/FAIRsharing.d6423b https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The WDC-SIL... [] [{'licence-name': 'SILSO legal notices', 'lice... None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN FAIRsharing_3226
1 2114 fairsharing-records 2014-11-04T15:23:40.000Z 2022-01-21T14:39:02.195Z 10.25504/FAIRsharing.p06nme Biological Magnetic Resonance Data Bank ready [{'contact-name': 'Helpdesk', 'contact-email':... https://bmrb.io/ 2114 BMRB collects, annotates, archives, and dissem... BMRB [{'url': 'https://bmrb.io/bmrb/news/', 'name':... 1988.0 [{'url': 'https://bmrb.io/data_library/rsync.s... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-000584, bsg-d000584] Database repository [Structural Biology] [Molecular structure, Protein structure, Pepti... [All] [] [United States] FAIRsharing record for: Biological Magnetic Re... BMRB https://fairsharing.org/10.25504/FAIRsharing.p... 10.25504/FAIRsharing.p06nme https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: BMRB collec... [{'id': 552, 'pubmed_id': 18288446, 'title': '... [{'licence-name': 'wwPDB Privacy and Usage Pol... None [{'doi': '10.1093/nar/gkm957', 'pubmed-id': 17... [{'url': 'https://bmrb.io/validate/', 'name': ... open yes https://bmrb.io/deposit/ open NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN FAIRsharing_2114
2 3022 fairsharing-records 2020-06-17T10:25:30.000Z 2022-02-08T10:41:04.073Z 10.25504/FAIRsharing.8b7a2f Fisheries and Oceans Canada Pacific Region Dat... ready [{'contact-name': 'Peter Chandler', 'contact-e... http://www.pac.dfo-mpo.gc.ca/science/oceans/da... 3022 The Institute of Ocean Sciences (IOS)/Ocean Sc... None [{'url': 'DFO.PAC.SCI.IOSData-DonneesISO.SCI.P... NaN [{'name': 'Users must contact the Senior Analy... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-001530, bsg-d001530] Database repository [Environmental Science, Meteorology, Earth Sci... [Climate] [Not applicable] [Salinity, Temperature] [Canada] FAIRsharing record for: Fisheries and Oceans C... None https://fairsharing.org/10.25504/FAIRsharing.8... 10.25504/FAIRsharing.8b7a2f https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The Institu... [] [{'licence-name': 'Fisheries and Oceans Canada... None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN FAIRsharing_3022
3 2998 fairsharing-records 2020-05-21T07:42:30.000Z 2022-02-08T10:40:19.531Z 10.25504/FAIRsharing.e08886 Climate Prediction Center ready [{'contact-name': 'Jon Hoopingarner', 'contact... https://www.cpc.ncep.noaa.gov/ 2998 The Climate Prediction Center (CPC) produces o... CPC [{'url': 'https://www.cpc.ncep.noaa.gov/commen... 1970.0 [{'url': 'https://www.cpc.ncep.noaa.gov/', 'na... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-001504, bsg-d001504] Database repository [Hydrogeology, Geography, Meteorology, Geodesy... [Climate] [Not applicable] [Forecasting, weather] [United States] FAIRsharing record for: Climate Prediction Center CPC https://fairsharing.org/10.25504/FAIRsharing.e... 10.25504/FAIRsharing.e08886 https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The Climate... [] [{'licence-name': 'National Weather Service Di... None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN FAIRsharing_2998
4 2301 fairsharing-records 2016-06-03T14:54:08.000Z 2021-11-24T13:17:51.201Z 10.25504/FAIRsharing.meh9wz Acytostelium Gene Database deprecated [{'contact-name': 'Acytostelium genome consort... http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... 2301 Genome and transcriptome database of Acytostel... NaN NaN 2008.0 NaN NaN [biodbcore-000775, bsg-d000775] Database repository [Genomics, Life Science, Transcriptomics] [DNA sequence data, Gene model annotation] [Acytostelium subglobosum] [] [United Kingdom, Japan] FAIRsharing record for: Acytostelium Gene Data... None https://fairsharing.org/10.25504/FAIRsharing.m... 10.25504/FAIRsharing.meh9wz https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: Genome and ... [{'id': 1139, 'pubmed_id': 25758444, 'title': ... [] None NaN NaN This resource is no longer available at the st... NaN NaN NaN NaN 2021-9-17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN FAIRsharing_2301
In [3]:
re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t',
                        converters={'subject': ast.literal_eval,
                                    'keyword': ast.literal_eval,
                                    'additionalName': ast.literal_eval,
                                    'repositoryIdentifier': ast.literal_eval,
                                    'type': ast.literal_eval,
                                    'contentType': ast.literal_eval,
                                    'providerType': ast.literal_eval,
                                    'institution': ast.literal_eval
                                    })

re3data_df['unique_id'] = 're3data_' + re3data_df.orgIdentifier
re3data_df = re3data_df.add_prefix('re3data_')
re3data_df.head()
Out[3]:
re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id
0 r3d100000001 Odum Institute Archive Dataverse eng [] https://dataverse.unc.edu/dataverse/odum [] ["https://dataverse.unc.edu/dataverse/odum#", ... The Odum Institute Archive Dataverse contains ... eng [disciplinary] {"size": "13 dataverses; 3.050 datasets", "upd... NaN NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... NaN [{'name': 'Databases', 'scheme': 'parse'}, {'n... [dataProvider] [FAIR, Middle East, crime, demography, economy... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC0", "databaseLicen... [{"dataAccessType": "embargoed", "dataAccessRe... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["DataVerse"] NaN {} ["DOI"] NaN [] unknown yes ["other"] [{"metadataStandardName": "DDI - Data Document... {} Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06 re3data_r3d100000001
1 r3d100000002 Access to Archival Databases eng [{'additionalName': 'AAD', 'additionalNameLang... https://aad.archives.gov/aad/ [RRID:SCR_010479, RRID:nlx_157752] ["https://www.archives.gov/contact"] You will find in the Access to Archival Databa... eng [disciplinary] {"size": "", "updatedp": ""} 1985 NaN ["eng", "spa"] [{'name': '1 Humanities and Social Sciences', ... https://www.archives.gov/publications/general-... [{'name': 'Images', 'scheme': 'parse'}, {'name... [dataProvider] [US History] [{'institutionName': 'The U.S. National Archiv... [{"policyName": "Contribution Policy", "policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "Copyrights", "dataLicens... restricted [] ["unknown"] no {"api": "https://www.archives.gov/developer#to... ["none"] https://aad.archives.gov/aad/help/getting-star... [] unknown unknown [] [] {"syndication": "http://www.archives.gov/socia... NaN 2012-07-04 2021-05-25 re3data_r3d100000002
2 r3d100000004 Datenbank Gesprochenes Deutsch deu [{'additionalName': 'DGD', 'additionalNameLang... https://dgd.ids-mannheim.de/ [] ["dgd@ids-mannheim.de"] The "Database for Spoken German (DGD)" is a co... eng [disciplinary] {"size": "34 corpora", "updatedp": "2020-02-03"} 2012 NaN ["deu"] [{'name': '1 Humanities and Social Sciences', ... https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... [{'name': 'Audiovisual data', 'scheme': 'parse... [dataProvider, serviceProvider] [Australian German, FOLK, German dialects, Pfe... [{'institutionName': 'Institut für Deutsche Sp... [{"policyName": "Erfurter Aufruf zur Sicherung... {"databaseAccessType": "restricted", "databas... [] [{"dataAccessType": "restricted", "dataAccessR... [{"dataLicenseName": "other", "dataLicenseURL"... restricted [] ["other"] yes {} ["none"] http://agd.ids-mannheim.de/konditionen.shtml [] unknown unknown ["RatSWD"] [] {} NaN 2012-07-20 2020-08-27 re3data_r3d100000004
3 r3d100000005 UNC Dataverse eng [{'additionalName': 'University of North Carol... https://dataverse.unc.edu/ [FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c] ["https://dataverse.unc.edu/", "odumarchive@un... UNC Dataverse is an open-source repository sof... eng [institutional] {"size": "186 dataverses; 25.272 studies; 229.... 2011 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://odum.unc.edu/about/mission-vision/ [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, census, demographic survey, demography,... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Data Deposit Form"... ["DataVerse"] yes {"api": "https://guides.dataverse.org/en/lates... ["ARK", "DOI", "PURL", "URN", "hdl"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [{"metadataStandardName": "DDI - Data Document... {} UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-10-25 re3data_r3d100000005
4 r3d100000006 Archaeology Data Service eng [{'additionalName': 'ADS', 'additionalNameLang... https://archaeologydataservice.ac.uk/ [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] ["help@archaeologydataservice.ac.uk", "https:/... The ADS is an accredited digital repository fo... eng [disciplinary] {"size": "1837 results", "updatedp": "2020-05-... 1996-10-01 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://archaeologydataservice.ac.uk/about/our... [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, archaeology, cultural heritage, prehist... [{'institutionName': 'Arts and Humanities Rese... [{"policyName": "ADS Guides to good practice",... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC", "databaseLicens... [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Guidelines for Dep... ["other"] yes {"api": "https://archaeologydataservice.ac.uk/... ["DOI"] https://archaeologydataservice.ac.uk/advice/te... [] unknown yes ["other"] [{"metadataStandardName": "DataCite Metadata S... {"syndication": "https://archaeologydataservic... ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02 re3data_r3d100000006
In [4]:
opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
                         converters={'repository_metadata.content_subjects': ast.literal_eval,
                                    'repository_metadata.alternativename': ast.literal_eval,
                                    'repository_metadata.content_types': ast.literal_eval,
                                    'organization': ast.literal_eval
                                    },
                         dtype={'system_metadata.id': str})

opendoar_df['unique_id'] = 'OpenDOAR_' + opendoar_df['system_metadata.id']
opendoar_df = opendoar_df.add_prefix('OpenDOAR_')
opendoar_df.head()
Out[4]:
OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_repository_metadata.repository_status OpenDOAR_repository_metadata.fulltext_record_count OpenDOAR_repository_metadata.metadata_record_count OpenDOAR_unique_id
0 134 {"name": "eldorado - repository of the tu dort... [{'name': 'eldorado - ressourcen aus und für l... https://eldorado.tu-dortmund.de NaN institutional [] 2022-01-12 15:34:54 2005-12-19 14:57:52 [arts, humanities, science, mathematics, socia... [journal_articles, conference_and_workshop_pap... [{'name': 'technische universität dortmund', '... [] {"name": "dspace", "version": ""} https://eldorado.tu-dortmund.de/oai/request yes NaN 9629.0 20963.0 OpenDOAR_134
1 58 {"name": "archive ouverte en sciences de linfo... [{'acronym': '@rchivesic'}] https://archivesic.ccsd.cnrs.fr NaN institutional [] 2022-01-12 15:34:53 2006-01-13 12:48:32 [arts, science, technology, engineering, mathe... [journal_articles, conference_and_workshop_pap... [{'name': 'centre pour la communication scient... [] {"name": "hal", "version": ""} https://api.archives-ouvertes.fr/oai/archivesic yes NaN 55492.0 1137498.0 OpenDOAR_58
2 93 {"name": "digitalcommons@the texas medical cen... [] http://digitalcommons.library.tmc.edu/ NaN institutional [] 2022-01-12 15:34:53 2006-02-14 11:16:12 [health and medicine] [journal_articles, theses_and_dissertations] [{'name': 'texas medical center', 'alternative... [] {"name": "other", "version": ""} http://digitalcommons.library.tmc.edu/do/oai/ yes NaN 2658.0 7268.0 OpenDOAR_93
3 68 {"name": "cognitive sciences eprint archive", ... [{'acronym': 'cogprints'}] http://cogprints.org/ NaN disciplinary [] 2022-01-12 15:34:53 2006-01-04 15:01:23 [humanities, health and medicine, science, soc... [journal_articles, conference_and_workshop_pap... [{'name': 'university of southampton', 'altern... [] {"name": "eprints", "version": ""} http://cogprints.org/cgi/oai2 yes NaN 2895.0 4277.0 OpenDOAR_68
4 84 {"name": "digital commons@carleton college", "... [] http://digitalcommons.carleton.edu/ NaN institutional [] 2022-01-12 15:34:53 2006-01-04 16:07:58 [humanities, science, social sciences] [journal_articles, unpub_reports_and_working_p... [{'name': 'carleton college', 'alternativeName... [] {"name": "other", "version": ""} NaN yes NaN NaN 42.0 OpenDOAR_84
In [5]:
roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
roar_df = roar_df.groupby('eprintid').aggregate(set)

def value_or_list(cell_set):
    copy = set(cell_set)
    copy.discard(np.nan) 
    if len(copy) == 0:
        return np.nan
    if len(copy) == 1:
        return copy.pop()
    return list(copy)
        
roar_df = roar_df.applymap(value_or_list)
roar_df.reset_index(inplace=True)

roar_df['unique_id'] = 'roar_' + roar_df.eprintid
roar_df = roar_df.add_prefix('roar_')
roar_df.head()
Out[5]:
roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
0 1 633 archive 1 NaN NaN disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-18 05:40:07 2010-01-06 13:43:48 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://archivesic.ccsd.cnrs.fr/ @RCHIVESIC http://archivesic.ccsd.cnrs.fr/oai/oai.php NaN NaN NaN NaN NaN NaN NaN NaN NaN fr NaN NaN NaN hal geoname_2_FR other NaN 2002-05-17 19:24:41 NaN NaN 0 0 0 25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... NaN NaN NaN NaN [celestial, opendoar] [58, 669] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_1
1 10 511 archive 1 NaN NaN disk0/00/00/00/10 2010-01-06 13:43:48 2011-07-18 05:40:13 2010-01-06 13:43:48 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://www.diva-portal.org/mdh/ Academic Archive On-line (Mälardalen Universit... http://www.diva-portal.org/oai/mdh/OAI NaN NaN NaN NaN TRUE TRUE NaN NaN NaN se Uppsala 59.8667 17.6333 diva geoname_2_SE other NaN 2005-12-08 13:15:22 NaN NaN 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,100... NaN NaN NaN NaN [celestial, opendoar] [526, 258] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_10
2 1000 274 archive 1 NaN NaN disk0/00/00/10/00 2010-01-06 13:45:01 2011-07-06 08:21:21 2010-01-06 13:45:01 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://pam.pisharp.org/ PAM - Portuguese Archive of Mathematics NaN NaN NaN NaN NaN TRUE TRUE NaN NaN NaN pt Bellevue, WA 47.6034 -122.155 dspace geoname_2_PT other NaN 2006-05-04 10:48:14 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_1000
3 10001 20 archive 91 NaN NaN disk0/00/01/00/01 2015-08-08 14:52:11 2016-03-21 19:44:01 2015-08-08 14:52:11 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://edoc.sub.uni-hamburg.de/klimawandel/ Klimawandel Dokumentenserver http://edoc.sub.uni-hamburg.de/klimawandel/oai NaN NaN NaN The "Documentenserver Klimawandel" (Repository... TRUE TRUE TRUE [Climate Service Center 2.0, Helmholtz-Zentrum... [http://www.klimzug.de/de/94.php, http://www.c... de Hamburg 53.5511 9.9937 opus geoname_2_DE other [HD, S1, GF, GE, G1] 2015-07-02 08:08:31 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [celestial, opendoar] [3408, 5881] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_10001
4 10008 11 archive 404 NaN NaN disk0/00/01/00/08 2015-08-08 14:52:26 2016-03-21 19:43:51 2015-08-08 14:52:26 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://creativematter.skidmore.edu/ Creative Matter | Skidmore College Research http://creativematter.skidmore.edu/do/oai/ NaN http://creativematter.skidmore.edu/recent.rss NaN Welcome to Creative Matter, a repository for t... TRUE FALSE FALSE Skidmore College http://www.skidmore.edu/ us Saratoga Springs 43.0961 -73.7818 bepress geoname_2_US other NaN 2015-07-06 17:35:50 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN celestial 5882 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_10008
In [6]:
roar_df[roar_df.roar_eprintid == '10013']
Out[6]:
roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
7 10013 31 archive 7104 NaN NaN disk0/00/01/00/13 2015-08-08 14:53:04 2016-03-21 19:54:43 2015-08-08 14:53:04 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://er.ucu.edu.ua/ ErUCU: Electronic repository of the Ukrainian ... http://er.ucu.edu.ua/oai/request http://er.ucu.edu.ua/sword/ http://er.ucu.edu.ua/feed/rss_2.0/site NaN Ukrainian Catholic Universitys institutional ... TRUE TRUE TRUE Ukrainian Catholic University http://ucu.edu.ua/eng/ ua Lviv NaN NaN dspace geoname_2_UA other [D1, DK, BL, BR, L1, BS, D901, B1, AC, BF, HM,... 2015-07-07 12:38:37 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [celestial, opendoar] [5883, 3410] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [russell_group, ivy_league] roar_10013

Loading dedup results

In [7]:
dup = pd.read_csv('../data/processed/ds_dedup_2022-02-16_13.03.17.csv',
                  sep=';', quotechar='"',
                  header=0, names=['dedup_id', 'duplicate_id', 'original_id', 'name', 'source'])
dup['unique_id'] = dup.source + '_' + dup.original_id
dup.head()
Out[7]:
dedup_id duplicate_id original_id name source unique_id
0 dedup::001e6d882e54c780ce269d3c46997287 https://fairsharing.org/10.25504/FAIRsharing.q... 2094 RESID Database of Protein Modifications FAIRsharing FAIRsharing_2094
1 dedup::001e6d882e54c780ce269d3c46997287 re3data::r3d100011306 r3d100011306 RESID Database of Protein Modifications re3data re3data_r3d100011306
2 dedup::003ab6b40af9b488decea7c582d150a2 re3data::r3d100011894 r3d100011894 Synapse re3data re3data_r3d100011894
3 dedup::003ab6b40af9b488decea7c582d150a2 https://fairsharing.org/10.25504/FAIRsharing.d... 2315 Synapse FAIRsharing FAIRsharing_2315
4 dedup::0048f2e3aa55ab88aaaac0cfa4153ad5 opendoar::4562 4562 erzincan binali yıldırım university institutio... OpenDOAR OpenDOAR_4562
In [8]:
dup.describe()
Out[8]:
dedup_id duplicate_id original_id name source unique_id
count 4712 4712 4712 4712 4712 4712
unique 2239 4712 4238 4017 4 4712
top dedup::67c12a6c3288a49f1db6a2343ec599ca https://fairsharing.org/10.25504/FAIRsharing.q... 3284 UPN JATIM REPOSITORY roar FAIRsharing_2094
freq 5 1 3 4 1981 1

Assessing duplicates distribution across registries

In [9]:
dup_grouped = dup.groupby('dedup_id').aggregate(list)
dup_grouped['source_set'] = dup_grouped.source.map(set)
In [10]:
dup_grouped[dup_grouped.source_set.str.len() == 4].count()
Out[10]:
duplicate_id    6
original_id     6
name            6
source          6
unique_id       6
source_set      6
dtype: int64
In [11]:
dup_grouped[dup_grouped.source_set.str.len() == 3].count()
Out[11]:
duplicate_id    61
original_id     61
name            61
source          61
unique_id       61
source_set      61
dtype: int64
In [12]:
dup_grouped[dup_grouped.source_set.str.len() == 2].count()
Out[12]:
duplicate_id    2029
original_id     2029
name            2029
source          2029
unique_id       2029
source_set      2029
dtype: int64
In [13]:
dup_grouped[dup_grouped.source_set.str.len() == 1].count()
Out[13]:
duplicate_id    143
original_id     143
name            143
source          143
unique_id       143
source_set      143
dtype: int64

Assessing duplicates within registries

In [14]:
roar_dup = dup[dup.source == 'roar'].groupby('dedup_id').count()
roar_dup = roar_dup[roar_dup.duplicate_id > 1]
roar_dup.aggregate(['count', 'sum'])
Out[14]:
duplicate_id original_id name source unique_id
count 249 249 249 249 249
sum 518 518 518 518 518
In [15]:
opendoar_dup = dup[dup.source == 'OpenDOAR'].groupby('dedup_id').count()
opendoar_dup = opendoar_dup[opendoar_dup.duplicate_id > 1]
opendoar_dup.aggregate(['count', 'sum'])
Out[15]:
duplicate_id original_id name source unique_id
count 30 30 30 30 30
sum 62 62 62 62 62
In [16]:
re3data_dup = dup[dup.source == 're3data'].groupby('dedup_id').count()
re3data_dup = re3data_dup[re3data_dup.duplicate_id > 1]
re3data_dup.aggregate(['count', 'sum'])
Out[16]:
duplicate_id original_id name source unique_id
count 3 3 3 3 3
sum 6 6 6 6 6
In [17]:
fairsharing_dup = dup[dup.source == 'FAIRsharing'].groupby('dedup_id').count()
fairsharing_dup = fairsharing_dup[fairsharing_dup.duplicate_id > 1]
fairsharing_dup.aggregate(['count', 'sum'])
Out[17]:
duplicate_id original_id name source unique_id
count 0 0 0 0 0
sum 0 0 0 0 0

There is one record which is counted twice. All ok: 2 records in roar and 2 in opendoar ['OpenDOAR_5226', 'roar_14929', 'OpenDOAR_3820', 'roar_16263']

In [18]:
np.intersect1d(roar_dup.index, opendoar_dup.index)
Out[18]:
array(['dedup::6973375bbb56846f0d935bd1cd9e0b98'], dtype=object)
In [19]:
dup[dup.dedup_id == 'dedup::6973375bbb56846f0d935bd1cd9e0b98']
Out[19]:
dedup_id duplicate_id original_id name source unique_id
1937 dedup::6973375bbb56846f0d935bd1cd9e0b98 opendoar::3820 3820 repositorio - universidad de la costa OpenDOAR OpenDOAR_3820
1938 dedup::6973375bbb56846f0d935bd1cd9e0b98 opendoar::5226 5226 repositorio universidad de la costa OpenDOAR OpenDOAR_5226
1939 dedup::6973375bbb56846f0d935bd1cd9e0b98 roar::14929 14929 Repositorio Universidad de la Costa roar roar_14929
1940 dedup::6973375bbb56846f0d935bd1cd9e0b98 roar::16263 16263 Repositorio Universidad de la Costa roar roar_16263

Isolating single-registry duplicates

In [20]:
dup_within = dup.groupby('dedup_id').aggregate(list)
dup_within['source_set'] = dup_within.source.map(set)
dup_within = dup_within[dup_within.source_set.str.len() == 1]
dup_within.head()
Out[20]:
duplicate_id original_id name source unique_id source_set
dedup_id
dedup::07b65089515c8f99812d14bbb01334a6 [roar::474, roar::5541] [474, 5541] [ECNIS Repository (Environmental Cancer Risk, ... [roar, roar] [roar_474, roar_5541] {roar}
dedup::0be44aa69610e09805d4002baf7e0b10 [roar::16867, roar::2907] [16867, 2907] [Chung Shan Medical University Institutional R... [roar, roar] [roar_16867, roar_2907] {roar}
dedup::0c34770edc42a1d2ac361b64cfabfb63 [roar::5432, roar::4030] [5432, 4030] [Digital Library of Jelenia Góra, Digital Libr... [roar, roar] [roar_5432, roar_4030] {roar}
dedup::0c6ed4b110c461d9350bf5c620bc78d7 [roar::3020, roar::3401, roar::5252] [3020, 3401, 5252] [KCE Repository, KCE Repository, KCE Repository] [roar, roar, roar] [roar_3020, roar_3401, roar_5252] {roar}
dedup::0e3c63baca694032044bbb00c2f1111e [roar::8405, roar::8716] [8405, 8716] [Content Pro IRX, Content Pro IRX] [roar, roar] [roar_8405, roar_8716] {roar}
In [21]:
dup_within['source_set'] = dup_within.source_set.map(set.pop)
dup_within.head()
Out[21]:
duplicate_id original_id name source unique_id source_set
dedup_id
dedup::07b65089515c8f99812d14bbb01334a6 [roar::474, roar::5541] [474, 5541] [ECNIS Repository (Environmental Cancer Risk, ... [roar, roar] [roar_474, roar_5541] roar
dedup::0be44aa69610e09805d4002baf7e0b10 [roar::16867, roar::2907] [16867, 2907] [Chung Shan Medical University Institutional R... [roar, roar] [roar_16867, roar_2907] roar
dedup::0c34770edc42a1d2ac361b64cfabfb63 [roar::5432, roar::4030] [5432, 4030] [Digital Library of Jelenia Góra, Digital Libr... [roar, roar] [roar_5432, roar_4030] roar
dedup::0c6ed4b110c461d9350bf5c620bc78d7 [roar::3020, roar::3401, roar::5252] [3020, 3401, 5252] [KCE Repository, KCE Repository, KCE Repository] [roar, roar, roar] [roar_3020, roar_3401, roar_5252] roar
dedup::0e3c63baca694032044bbb00c2f1111e [roar::8405, roar::8716] [8405, 8716] [Content Pro IRX, Content Pro IRX] [roar, roar] [roar_8405, roar_8716] roar
In [22]:
dup_within.groupby('dedup_id').ngroups
Out[22]:
143
In [23]:
dup_within.groupby('source_set').count()
Out[23]:
duplicate_id original_id name source unique_id
source_set
OpenDOAR 18 18 18 18 18
re3data 2 2 2 2 2
roar 123 123 123 123 123
In [24]:
dup_within = dup[dup.dedup_id.isin(dup_within.index)]
dup_within
Out[24]:
dedup_id duplicate_id original_id name source unique_id
122 dedup::07b65089515c8f99812d14bbb01334a6 roar::474 474 ECNIS Repository (Environmental Cancer Risk roar roar_474
123 dedup::07b65089515c8f99812d14bbb01334a6 roar::5541 5541 ECNIS Repository (Environmental Cancer Risk roar roar_5541
184 dedup::0be44aa69610e09805d4002baf7e0b10 roar::16867 16867 Chung Shan Medical University Institutional Re... roar roar_16867
185 dedup::0be44aa69610e09805d4002baf7e0b10 roar::2907 2907 Chung Shan Medical University Institutional Re... roar roar_2907
192 dedup::0c34770edc42a1d2ac361b64cfabfb63 roar::5432 5432 Digital Library of Jelenia Góra roar roar_5432
... ... ... ... ... ... ...
4583 dedup::f9293f212c2f13c7cc7a2d2a967ac7d5 roar::13134 13134 Repositorio Universidad de Sucre roar roar_13134
4608 dedup::fab2415bf42ac76e4ae00aa68b61a4ba roar::5482 5482 Biblioteca Virtual del Centro de Documentación roar roar_5482
4609 dedup::fab2415bf42ac76e4ae00aa68b61a4ba roar::5214 5214 Biblioteca Virtual del Centro de Documentación roar roar_5214
4690 dedup::fee4180dcb5f2af4d963b6d74d82d8c2 roar::3992 3992 York St John University ArchivalWare Digital L... roar roar_3992
4691 dedup::fee4180dcb5f2af4d963b6d74d82d8c2 roar::5185 5185 York St John University ArchivalWare Digital L... roar roar_5185

296 rows × 6 columns

Isolating hybrid duplicates

In [25]:
dup_across = dup[~dup.dedup_id.isin(dup_within.dedup_id)]
dup_across = dup_across.groupby('dedup_id').aggregate(list)
dup_across['source_set'] = dup_across.source.map(set)

dup_hybrid = dup_across[dup_across.source_set.str.len() < dup_across.source.str.len()]
dup_hybrid = dup[dup.dedup_id.isin(dup_hybrid.index)]
dup_hybrid
Out[25]:
dedup_id duplicate_id original_id name source unique_id
53 dedup::038ef33e8d3de51d3536d62e6c103be7 roar::6167 6167 Institutional Repository UIN Syarif Hidayatull... roar roar_6167
54 dedup::038ef33e8d3de51d3536d62e6c103be7 opendoar::2717 2717 institutional repository uin syarif hidayatull... OpenDOAR OpenDOAR_2717
55 dedup::038ef33e8d3de51d3536d62e6c103be7 roar::6580 6580 Institutional Repository UIN Syarif Hidayatull... roar roar_6580
72 dedup::044edcd1c961b3942a7e0e90d1005e2d roar::7902 7902 The University of Arizona Campus Repository roar roar_7902
73 dedup::044edcd1c961b3942a7e0e90d1005e2d opendoar::2468 2468 university of arizona campus repository OpenDOAR OpenDOAR_2468
... ... ... ... ... ... ...
4596 dedup::fa0721f07402e0593da77a46fa687da6 opendoar::2545 2545 sanok digital library OpenDOAR OpenDOAR_2545
4597 dedup::fa0721f07402e0593da77a46fa687da6 roar::5746 5746 Sanok Digital Library roar roar_5746
4610 dedup::fab888b1713fb886b13bbd2d569bba60 opendoar::2539 2539 publication server of the wuppertal institute OpenDOAR OpenDOAR_2539
4611 dedup::fab888b1713fb886b13bbd2d569bba60 roar::11212 11212 Publication Server of the Wuppertal Institute roar roar_11212
4612 dedup::fab888b1713fb886b13bbd2d569bba60 roar::5891 5891 Publication Server of the Wuppertal Institute roar roar_5891

434 rows × 6 columns

In [26]:
dup_hybrid.groupby('dedup_id').ngroups
Out[26]:
138

Isolating multiple-registry duplicates

In [27]:
dup_across = dup_across[dup_across.source_set.str.len() == dup_across.source.str.len()]
dup_across = dup[dup.dedup_id.isin(dup_across.index)]
dup_across
# dup[dup.dedup_id.isin(dup_across.index)]
Out[27]:
dedup_id duplicate_id original_id name source unique_id
0 dedup::001e6d882e54c780ce269d3c46997287 https://fairsharing.org/10.25504/FAIRsharing.q... 2094 RESID Database of Protein Modifications FAIRsharing FAIRsharing_2094
1 dedup::001e6d882e54c780ce269d3c46997287 re3data::r3d100011306 r3d100011306 RESID Database of Protein Modifications re3data re3data_r3d100011306
2 dedup::003ab6b40af9b488decea7c582d150a2 re3data::r3d100011894 r3d100011894 Synapse re3data re3data_r3d100011894
3 dedup::003ab6b40af9b488decea7c582d150a2 https://fairsharing.org/10.25504/FAIRsharing.d... 2315 Synapse FAIRsharing FAIRsharing_2315
4 dedup::0048f2e3aa55ab88aaaac0cfa4153ad5 opendoar::4562 4562 erzincan binali yıldırım university institutio... OpenDOAR OpenDOAR_4562
... ... ... ... ... ... ...
4707 dedup::ff7d2ea87cebddb182db2fb8cf32aa89 opendoar::2126 2126 sophia OpenDOAR OpenDOAR_2126
4708 dedup::ffb342887a73ec0ead022e0414d765b1 roar::668 668 Infoscience: École polytechnique fédérale de L... roar roar_668
4709 dedup::ffb342887a73ec0ead022e0414d765b1 opendoar::185 185 infoscience - école polytechnique fédérale de ... OpenDOAR OpenDOAR_185
4710 dedup::ffbb6800107747f9224cdde0df95da7c opendoar::3122 3122 istanbul bilgi university library open access OpenDOAR OpenDOAR_3122
4711 dedup::ffbb6800107747f9224cdde0df95da7c roar::13646 13646 Istanbul Bilgi University Library Open Access roar roar_13646

3982 rows × 6 columns

In [28]:
dup_across.groupby('dedup_id').ngroups
Out[28]:
1958

Double check partitions

In [29]:
dup.count()
Out[29]:
dedup_id        4712
duplicate_id    4712
original_id     4712
name            4712
source          4712
unique_id       4712
dtype: int64
In [30]:
dup_across.count() + dup_within.count() + dup_hybrid.count()
Out[30]:
dedup_id        4712
duplicate_id    4712
original_id     4712
name            4712
source          4712
unique_id       4712
dtype: int64
In [31]:
dup_within.groupby('dedup_id').ngroups + dup_across.groupby('dedup_id').ngroups + dup_hybrid.groupby('dedup_id').ngroups
Out[31]:
2239
In [32]:
dup.groupby('dedup_id').ngroups
Out[32]:
2239

Joining information

In [33]:
dup_within = dup_within.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_within = dup_within.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_within = dup_within.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_within = dup_within.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_within.head()
Out[33]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.metadata.cross-references FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.url-for-logo FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.data-access-condition.type FAIRsharing_attributes.metadata.data-contact-information FAIRsharing_attributes.metadata.data-deposition-condition.url FAIRsharing_attributes.metadata.data-deposition-condition.type FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.data-access-condition.url FAIRsharing_attributes.metadata.resource-sustainability.url FAIRsharing_attributes.metadata.resource-sustainability.name FAIRsharing_attributes.metadata.data-preservation-policy.url FAIRsharing_attributes.metadata.data-preservation-policy.name FAIRsharing_attributes.metadata.data-access-for-pre-publication-review FAIRsharing_attributes.metadata.data-versioning FAIRsharing_attributes.metadata.data-curation.type FAIRsharing_attributes.metadata.data-curation.url FAIRsharing_attributes.metadata.citation-to-related-publications FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_repository_metadata.repository_status OpenDOAR_repository_metadata.fulltext_record_count OpenDOAR_repository_metadata.metadata_record_count OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
0 dedup::07b65089515c8f99812d14bbb01334a6 roar::474 474 ECNIS Repository (Environmental Cancer Risk roar roar_474 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 474 281 archive 1 NaN NaN disk0/00/00/04/74 2010-01-06 13:44:22 2011-07-06 08:19:53 2010-01-06 13:44:22 other NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://ecnis.openrepository.com/ ECNIS Repository (Environmental Cancer Risk, N... NaN NaN NaN NaN This site is a subject specific repository con... TRUE TRUE NaN ECNIS (Environmental Cancer Risk, Nutrition an... http://www.ecnis.org pl Lodz 51.8 19.5 openrepo geoname_2_PL other NaN 2008-06-03 08:05:43 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN opendoar 1254 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_474
1 dedup::07b65089515c8f99812d14bbb01334a6 roar::5541 5541 ECNIS Repository (Environmental Cancer Risk roar roar_5541 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5541 8 archive 8 NaN NaN disk0/00/00/55/41 2012-12-12 01:21:03 2012-12-15 02:51:35 2012-12-12 01:21:03 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://ecnis.openrepository.com/ecnis/ ECNIS Repository (Environmental Cancer Risk, N... NaN NaN NaN NaN This site is a subject specific repository con... NaN NaN NaN ECNIS Network of Excellence http://www.ecnis.org/ pl NaN 51.8 19.5 NaN geoname_2_PL other NaN 2012-07-01 15:13:36 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN opendoar 1254 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_5541
2 dedup::0be44aa69610e09805d4002baf7e0b10 roar::16867 16867 Chung Shan Medical University Institutional Re... roar roar_16867 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16867 3 archive 360 NaN NaN disk0/00/01/68/67 2021-02-25 13:06:19 2021-02-25 13:06:19 2021-02-25 13:06:19 institutional 2907 NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN https://ir.csmu.edu.tw:8080 Chung Shan Medical University Institutional Re... https://ir.csmu.edu.tw:8080/ir-oai/request?ver... NaN NaN NaN NaN TRUE TRUE FALSE NaN NaN NaN NaN NaN NaN dspace NaN other [RT, RC0254, RC1200, R1, RK] 2009-10-21 00:00:00 NaN NaN 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28... NaN NaN NaN NaN NaN NaN celestial NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_16867
3 dedup::0be44aa69610e09805d4002baf7e0b10 roar::2907 2907 Chung Shan Medical University Institutional Re... roar roar_2907 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2907 548 archive 360 NaN NaN disk0/00/00/29/07 2010-07-29 01:40:55 2021-02-17 06:33:34 2010-07-29 01:40:55 institutional NaN NaN no_search NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN https://ir.csmu.edu.tw:8080 Chung Shan Medical University Institutional Re... https://ir.csmu.edu.tw:8080/ir-oai/request?ver... NaN NaN NaN NaN TRUE TRUE FALSE NaN NaN NaN NaN NaN NaN dspace NaN other [RC0321, RT, RC0254, RC1200, R1, RK] 2009-10-21 00:00:00 NaN NaN 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28... NaN NaN NaN NaN NaN NaN celestial NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_2907
4 dedup::0c34770edc42a1d2ac361b64cfabfb63 roar::5432 5432 Digital Library of Jelenia Góra roar roar_5432 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5432 9 archive 8 NaN NaN disk0/00/00/54/32 2012-11-19 20:28:01 2012-11-26 06:53:38 2012-11-19 20:28:01 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://jbc.jelenia-gora.pl/dlibra.html Digital Library of Jelenia Góra http://jbc.jelenia-gora.pl/dlibra/oai-pmh-repo... NaN NaN NaN Users may set up RSS feeds to be alerted to ne... NaN NaN NaN Jeleniogórskie Centrum Informacji i Edukacji R... http://biblioteka.jelenia-gora.pl/ pl NaN 50.9012 15.7341 NaN geoname_2_PL other NaN 2012-07-01 15:12:22 NaN NaN 0 0 0 20 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19... NaN NaN NaN NaN [celestial, opendoar] [4595, 2211] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_5432
In [34]:
dup_hybrid = dup_hybrid.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_hybrid.head()
Out[34]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.metadata.cross-references FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.url-for-logo FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.data-access-condition.type FAIRsharing_attributes.metadata.data-contact-information FAIRsharing_attributes.metadata.data-deposition-condition.url FAIRsharing_attributes.metadata.data-deposition-condition.type FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.data-access-condition.url FAIRsharing_attributes.metadata.resource-sustainability.url FAIRsharing_attributes.metadata.resource-sustainability.name FAIRsharing_attributes.metadata.data-preservation-policy.url FAIRsharing_attributes.metadata.data-preservation-policy.name FAIRsharing_attributes.metadata.data-access-for-pre-publication-review FAIRsharing_attributes.metadata.data-versioning FAIRsharing_attributes.metadata.data-curation.type FAIRsharing_attributes.metadata.data-curation.url FAIRsharing_attributes.metadata.citation-to-related-publications FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_repository_metadata.repository_status OpenDOAR_repository_metadata.fulltext_record_count OpenDOAR_repository_metadata.metadata_record_count OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
0 dedup::038ef33e8d3de51d3536d62e6c103be7 roar::6167 6167 Institutional Repository UIN Syarif Hidayatull... roar roar_6167 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6167 13 archive 2178 NaN NaN disk0/00/00/61/67 2012-12-12 05:42:58 2013-07-14 15:12:12 2012-12-12 05:42:58 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://repository.uinjkt.ac.id Institutional Repository UIN Syarif Hidayatull... [http://repository.uinjkt.ac.id/oai, http://re... http://repository.uinjkt.ac.id/sword/ NaN NaN nstitutional Repository UIN Syarif Hidayatulla... TRUE TRUE FALSE [UIN Syarif Hidayatullah Jakarta, Pascasarjana... [http://www.uinjkt.ac.id, http://graduate.uinj... id Jakarta NaN NaN dspace geoname_2_ID other AI 2012-11-07 08:11:19 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [celestial, opendoar] [5108, 2717, 5109] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_6167
1 dedup::038ef33e8d3de51d3536d62e6c103be7 opendoar::2717 2717 institutional repository uin syarif hidayatull... OpenDOAR OpenDOAR_2717 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2717 {"name": "institutional repository uin syarif ... [] http://repository.uinjkt.ac.id/dspace/ NaN institutional [] 2022-01-12 15:35:36 2013-07-11 15:52:01 [science, arts, humanities, social sciences, h... [theses_and_dissertations] [{'name': 'uin syarif hidayatullah jakarta, st... [] {"name": "dspace", "version": ""} http://repository.uinjkt.ac.id/oai/ yes NaN 0.0 36862.0 OpenDOAR_2717 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 dedup::038ef33e8d3de51d3536d62e6c103be7 roar::6580 6580 Institutional Repository UIN Syarif Hidayatull... roar roar_6580 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6580 16 archive 2040 NaN NaN disk0/00/00/65/80 2013-03-31 16:02:51 2013-04-06 01:42:14 2013-03-31 16:02:51 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://repository.uinjkt.ac.id Institutional Repository UIN Syarif Hidayatull... NaN NaN NaN NaN NaN FALSE FALSE FALSE NaN NaN id Jakarta 106.756 -6.30591 dspace geoname_2_ID other [AC, Z665, Z004, Z719, BP, Q1] 2013-03-04 07:20:37 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_6580
3 dedup::044edcd1c961b3942a7e0e90d1005e2d roar::7902 7902 The University of Arizona Campus Repository roar roar_7902 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7902 12 archive 4910 NaN NaN disk0/00/00/79/02 2014-03-05 11:50:29 2014-05-08 13:10:29 2014-03-05 11:50:29 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://arizona.openrepository.com/arizona/ The University of Arizona Campus Repository http://arizona.openrepository.com/arizona/oai/... NaN http://arizona.openrepository.com/arizona/feed... NaN The UA Campus Repository is an institutional r... TRUE TRUE FALSE The University of Arizona http://www.arizona.edu/ us Tucson NaN NaN dspace geoname_2_US other NaN 2014-02-25 20:17:47 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [celestial, opendoar] [5404, http://opendoar.org/id/2468/] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN roar_7902
4 dedup::044edcd1c961b3942a7e0e90d1005e2d opendoar::2468 2468 university of arizona campus repository OpenDOAR OpenDOAR_2468 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2468 {"name": "university of arizona campus reposit... [] http://arizona.openrepository.com/arizona/ NaN institutional [] 2022-01-12 15:35:32 2012-05-02 09:50:07 [science, arts, humanities, health and medicin... [journal_articles, theses_and_dissertations, u... [{'name': 'university of arizona', 'alternativ... [] {"name": "other", "version": ""} NaN yes NaN NaN 63231.0 OpenDOAR_2468 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [35]:
dup_across = dup_across.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_across = dup_across.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_across = dup_across.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_across = dup_across.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_across.head()
Out[35]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.metadata.cross-references FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.url-for-logo FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.data-access-condition.type FAIRsharing_attributes.metadata.data-contact-information FAIRsharing_attributes.metadata.data-deposition-condition.url FAIRsharing_attributes.metadata.data-deposition-condition.type FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.data-access-condition.url FAIRsharing_attributes.metadata.resource-sustainability.url FAIRsharing_attributes.metadata.resource-sustainability.name FAIRsharing_attributes.metadata.data-preservation-policy.url FAIRsharing_attributes.metadata.data-preservation-policy.name FAIRsharing_attributes.metadata.data-access-for-pre-publication-review FAIRsharing_attributes.metadata.data-versioning FAIRsharing_attributes.metadata.data-curation.type FAIRsharing_attributes.metadata.data-curation.url FAIRsharing_attributes.metadata.citation-to-related-publications FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_repository_metadata.repository_status OpenDOAR_repository_metadata.fulltext_record_count OpenDOAR_repository_metadata.metadata_record_count OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id
0 dedup::001e6d882e54c780ce269d3c46997287 https://fairsharing.org/10.25504/FAIRsharing.q... 2094 RESID Database of Protein Modifications FAIRsharing FAIRsharing_2094 2094 fairsharing-records 2014-11-04T15:23:40.000Z 2021-12-06T10:49:03.952Z 10.25504/FAIRsharing.qaszjp RESID Database of Protein Modifications ready [{'contact-name': 'John S Garavelli', 'contact... http://pir.georgetown.edu/resid/ 2094.0 The RESID Database of Protein Modifications is... RESID [{'url': 'http://pir.georgetown.edu/resid/faq.... NaN [{'url': 'ftp://ftp.pir.georgetown.edu/pir_dat... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-000563, bsg-d000563] Database knowledgebase [Life Science] [Molecular structure, Small molecule, Structur... [All] [] [United Kingdom, European Union, Switzerland] FAIRsharing record for: RESID Database of Prot... RESID https://fairsharing.org/10.25504/FAIRsharing.q... 10.25504/FAIRsharing.qaszjp https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The RESID D... [{'id': 334, 'pubmed_id': 12520062, 'title': '... [{'licence-name': 'Open Data Commons (ODC) Pub... None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN FAIRsharing_2094 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 dedup::001e6d882e54c780ce269d3c46997287 re3data::r3d100011306 r3d100011306 RESID Database of Protein Modifications re3data re3data_r3d100011306 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN r3d100011306 RESID Database of Protein Modifications eng [] https://pir.georgetown.edu/resid/resid.shtml [FAIRsharing_doi:10.25504/FAIRsharing.qaszjp, ... ["pirmail@georgetown.edu"] The RESID Database of Protein Modifications is... eng [disciplinary] {"size": "", "updatedp": ""} 2014 NaN ["eng"] [{'name': '2 Life Sciences', 'scheme': 'DFG'},... NaN [{'name': 'Images', 'scheme': 'parse'}, {'name... [dataProvider] [genomes, life sciences, proteins, proteomes, ... [{'institutionName': 'Georgetown University, M... [{"policyName": "Terms of Use", "policyURL": "... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "Copyrights", "dataLicens... closed [] ["unknown"] yes {"api": "ftp://ftp.pir.georgetown.edu/database... ["none"] NaN [] yes unknown [] [] {} RESID is covered by Thomson Reuters Data Citat... 2014-12-05 2019-01-17 re3data_r3d100011306 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 dedup::003ab6b40af9b488decea7c582d150a2 re3data::r3d100011894 r3d100011894 Synapse re3data re3data_r3d100011894 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN r3d100011894 Synapse eng [] https://www.synapse.org [FAIRsharing_DOI:10.25504/FAIRsharing.dnxzmk, ... ["synapseinfo@sagebase.org"] Synapse is an open source software platform th... eng [other] {"size": "", "updatedp": ""} 2012-05-22 NaN ["eng"] [{'name': '2 Life Sciences', 'scheme': 'DFG'},... https://sagebionetworks.org/tools_resources/sy... [{'name': 'Raw data', 'scheme': 'parse'}, {'na... [dataProvider, serviceProvider] [AMP-AD Knowledge Portal, DREAM Challenges, Gi... [{'institutionName': 'Alfred P. Sloan Foundati... [{"policyName": "Synapse Commons Governance Ov... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "closed", "dataAccessRestr... [{"dataLicenseName": "other", "dataLicenseURL"... restricted [] ["unknown"] yes {"api": "https://docs.synapse.org/rest/", "api... ["DOI"] NaN [] yes yes [] [] {} NaN 2015-12-03 2021-11-16 re3data_r3d100011894 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 dedup::003ab6b40af9b488decea7c582d150a2 https://fairsharing.org/10.25504/FAIRsharing.d... 2315 Synapse FAIRsharing FAIRsharing_2315 2315 fairsharing-records 2016-08-02T13:56:30.000Z 2021-12-06T10:48:25.700Z 10.25504/FAIRsharing.dnxzmk Synapse ready [{'contact-name': 'Meredith Slota', 'contact-e... https://www.synapse.org/ 2315.0 Synapse is a collaborative research platform t... Synapse [{'url': 'SynapseInfo@sagebase.org', 'name': '... 2010.0 [{'url': 'https://www.synapse.org/', 'name': '... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-000791, bsg-d000791] Database repository [Data Integration, Data Management, Biomedical... [Experimental measurement, Protocol, Data stor... [All] [] [United States] FAIRsharing record for: Synapse Synapse https://fairsharing.org/10.25504/FAIRsharing.d... 10.25504/FAIRsharing.dnxzmk https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: Synapse is ... [{'id': 2450, 'pubmed_id': 24071850, 'title': ... [{'licence-name': 'Creative Commons Attributio... None NaN [{'url': 'https://sage-bionetworks.github.io/r... NaN NaN NaN NaN NaN NaN [{'url': 'http://rest-docs.synapse.org/rest/',... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN FAIRsharing_2315 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 dedup::0048f2e3aa55ab88aaaac0cfa4153ad5 opendoar::4562 4562 erzincan binali yıldırım university institutio... OpenDOAR OpenDOAR_4562 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4562 {"name": "erzincan binali y\u0131ld\u0131r\u01... [] http://earsiv.erzincan.edu.tr NaN institutional [] 2022-01-12 15:36:06 2019-04-24 09:06:10 [social sciences] [journal_articles] [{'name': 'erzincan binali yıldırım university... [] {"name": "dspace", "version": ""} http://earsiv.erzincan.edu.tr/oai yes NaN NaN NaN OpenDOAR_4562 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [36]:
dup_within = dup_within.groupby('dedup_id').aggregate(list).reset_index()
dup_within['source_set'] = dup_within.source.map(set)
dup_within.head()
Out[36]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.metadata.cross-references FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.url-for-logo FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.data-access-condition.type FAIRsharing_attributes.metadata.data-contact-information FAIRsharing_attributes.metadata.data-deposition-condition.url FAIRsharing_attributes.metadata.data-deposition-condition.type FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.data-access-condition.url FAIRsharing_attributes.metadata.resource-sustainability.url FAIRsharing_attributes.metadata.resource-sustainability.name FAIRsharing_attributes.metadata.data-preservation-policy.url FAIRsharing_attributes.metadata.data-preservation-policy.name FAIRsharing_attributes.metadata.data-access-for-pre-publication-review FAIRsharing_attributes.metadata.data-versioning FAIRsharing_attributes.metadata.data-curation.type FAIRsharing_attributes.metadata.data-curation.url FAIRsharing_attributes.metadata.citation-to-related-publications FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_repository_metadata.repository_status OpenDOAR_repository_metadata.fulltext_record_count OpenDOAR_repository_metadata.metadata_record_count OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id source_set
0 dedup::07b65089515c8f99812d14bbb01334a6 [roar::474, roar::5541] [474, 5541] [ECNIS Repository (Environmental Cancer Risk, ... [roar, roar] [roar_474, roar_5541] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [474, 5541] [281, 8] [archive, archive] [1, 8] [nan, nan] [nan, nan] [disk0/00/00/04/74, disk0/00/00/55/41] [2010-01-06 13:44:22, 2012-12-12 01:21:03] [2011-07-06 08:19:53, 2012-12-15 02:51:35] [2010-01-06 13:44:22, 2012-12-12 01:21:03] [other, institutional] [nan, nan] [nan, nan] [show, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [0, nan] [nan, nan] [nan, nan] [nan, nan] [http://ecnis.openrepository.com/, http://ecni... [ECNIS Repository (Environmental Cancer Risk, ... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [This site is a subject specific repository co... [TRUE, nan] [TRUE, nan] [nan, nan] [ECNIS (Environmental Cancer Risk, Nutrition a... [http://www.ecnis.org, http://www.ecnis.org/] [pl, pl] [Lodz, nan] [51.8, 51.8] [19.5, 19.5] [openrepo, nan] [geoname_2_PL, geoname_2_PL] [other, other] [nan, nan] [2008-06-03 08:05:43, 2012-07-01 15:13:36] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [opendoar, opendoar] [1254, 1254] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roar_474, roar_5541] {roar}
1 dedup::0be44aa69610e09805d4002baf7e0b10 [roar::16867, roar::2907] [16867, 2907] [Chung Shan Medical University Institutional R... [roar, roar] [roar_16867, roar_2907] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [16867, 2907] [3, 548] [archive, archive] [360, 360] [nan, nan] [nan, nan] [disk0/00/01/68/67, disk0/00/00/29/07] [2021-02-25 13:06:19, 2010-07-29 01:40:55] [2021-02-25 13:06:19, 2021-02-17 06:33:34] [2021-02-25 13:06:19, 2010-07-29 01:40:55] [institutional, institutional] [2907, nan] [nan, nan] [show, no_search] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [0, 0] [nan, nan] [nan, nan] [nan, nan] [https://ir.csmu.edu.tw:8080, https://ir.csmu.... [Chung Shan Medical University Institutional R... [https://ir.csmu.edu.tw:8080/ir-oai/request?ve... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [TRUE, TRUE] [TRUE, TRUE] [FALSE, FALSE] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [dspace, dspace] [nan, nan] [other, other] [[RT, RC0254, RC1200, R1, RK], [RC0321, RT, RC... [2009-10-21 00:00:00, 2009-10-21 00:00:00] [nan, nan] [nan, nan] [0, 0] [0, 0] [0, 0] [100, 100] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [celestial, celestial] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roar_16867, roar_2907] {roar}
2 dedup::0c34770edc42a1d2ac361b64cfabfb63 [roar::5432, roar::4030] [5432, 4030] [Digital Library of Jelenia Góra, Digital Libr... [roar, roar] [roar_5432, roar_4030] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [5432, 4030] [9, 12] [archive, archive] [8, 8] [nan, nan] [nan, nan] [disk0/00/00/54/32, disk0/00/00/40/30] [2012-11-19 20:28:01, 2011-08-02 23:17:15] [2012-11-26 06:53:38, 2012-02-06 06:58:00] [2012-11-19 20:28:01, 2011-08-02 23:17:15] [institutional, institutional] [nan, nan] [nan, nan] [show, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [http://jbc.jelenia-gora.pl/dlibra.html, http:... [Digital Library of Jelenia Góra, Digital Libr... [http://jbc.jelenia-gora.pl/dlibra/oai-pmh-rep... [nan, nan] [nan, nan] [nan, nan] [Users may set up RSS feeds to be alerted to n... [nan, nan] [nan, nan] [nan, nan] [Jeleniogórskie Centrum Informacji i Edukacji ... [http://biblioteka.jelenia-gora.pl/, http://bi... [pl, pl] [nan, nan] [50.9012, 50.9012] [15.7341, 15.7341] [nan, nan] [geoname_2_PL, geoname_2_PL] [other, other] [nan, nan] [2012-07-01 15:12:22, 2009-10-21 11:09:50] [nan, nan] [nan, nan] [0, 0] [0, 0] [0, 0] [20, 20] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1... [nan, nan] [nan, nan] [nan, nan] [nan, nan] celestial, opendoar], [celestial, opendoar 4595, 2211], [4595, 2211 [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roar_5432, roar_4030] {roar}
3 dedup::0c6ed4b110c461d9350bf5c620bc78d7 [roar::3020, roar::3401, roar::5252] [3020, 3401, 5252] [KCE Repository, KCE Repository, KCE Repository] [roar, roar, roar] [roar_3020, roar_3401, roar_5252] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [3020, 3401, 5252] [260, 82, 10] [archive, archive, archive] [8, 8, 8] [nan, nan, nan] [nan, nan, nan] [disk0/00/00/30/20, disk0/00/00/34/01, disk0/0... [2010-09-13 09:52:23, 2010-12-20 21:30:30, 201... [2016-04-17 21:53:51, 2016-04-17 21:51:59, 201... [2010-09-13 09:52:22, 2010-12-20 21:30:30, 201... [other, other, other] [nan, nan, nan] [nan, nan, nan] [show, show, show] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [0, 0, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [http://kce.docressources.info/opac/index.php?... [KCE Repository, KCE Repository, KCE Repository] [http://kce.docressources.info/ws/PMBWs_2, htt... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [This site provides access to the publication ... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [Belgian Health Care Knowledge Centre, Belgian... [http://www.kce.fgov.be/, http://www.kce.fgov.... [be, be, be] [nan, nan, nan] [50.8463, 50.8463, 50.8463] [4.3547, 4.3547, 4.3547] [nan, nan, nan] [nan, nan, geoname_2_BE] [other, other, other] [nan, nan, nan] [2009-01-19 09:04:11, 2009-01-19 09:04:11, 201... [nan, nan, nan] [nan, nan, nan] [0, 0, 0] [0, 0, 0] [0, 0, 0] [250, 250, 250] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [[celestial, opendoar], [celestial, opendoar],... 2246, 1879], [2246, 1879], [2246, 1879 [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... [909, 909, 909] [nan, nan, nan] [roar_3020, roar_3401, roar_5252] {roar}
4 dedup::0e3c63baca694032044bbb00c2f1111e [roar::8405, roar::8716] [8405, 8716] [Content Pro IRX, Content Pro IRX] [roar, roar] [roar_8405, roar_8716] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [8405, 8716] [17, 12] [archive, archive] [5386, 5386] [nan, nan] [nan, nan] [disk0/00/00/84/05, disk0/00/00/87/16] [2014-06-24 10:13:16, 2014-10-08 18:39:33] [2014-06-28 01:36:04, 2014-10-11 01:36:34] [2014-06-24 10:13:16, 2014-10-08 18:39:33] [institutional, institutional] [nan, nan] [nan, nan] [show, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [http://encore.tut.ac.za/iii/cpro, http://enco... [Content Pro IRX, Content Pro IRX] [nan, http://encore.tut.ac.za/iii/oairep/OAIRe... [http://encore.tut.ac.za/iii/cpro/, nan] [nan, nan] [nan, nan] [Tshwane University of Technology Digital Open... [TRUE, TRUE] [TRUE, TRUE] [TRUE, FALSE] [Tshwane University of Technology, Tshwane Uni... [http://lib.tut.ac.za, http://tut.ac.za] [za, za] [Pretoria, Pretoria] [-25, 25] [28, 28] [other, nan] [geoname_2_ZA, geoname_2_ZA] [other, other] [nan, nan] [2014-05-26 13:47:54, 2014-07-24 06:31:10] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [opendoar, celestial] [3078, 5657] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [roar_8405, roar_8716] {roar}
In [37]:
dup_hybrid = dup_hybrid.groupby('dedup_id').aggregate(list).reset_index()
dup_hybrid['source_set'] = dup_hybrid.source.map(set)
dup_hybrid.head()
Out[37]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.metadata.cross-references FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.url-for-logo FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.data-access-condition.type FAIRsharing_attributes.metadata.data-contact-information FAIRsharing_attributes.metadata.data-deposition-condition.url FAIRsharing_attributes.metadata.data-deposition-condition.type FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.data-access-condition.url FAIRsharing_attributes.metadata.resource-sustainability.url FAIRsharing_attributes.metadata.resource-sustainability.name FAIRsharing_attributes.metadata.data-preservation-policy.url FAIRsharing_attributes.metadata.data-preservation-policy.name FAIRsharing_attributes.metadata.data-access-for-pre-publication-review FAIRsharing_attributes.metadata.data-versioning FAIRsharing_attributes.metadata.data-curation.type FAIRsharing_attributes.metadata.data-curation.url FAIRsharing_attributes.metadata.citation-to-related-publications FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_repository_metadata.repository_status OpenDOAR_repository_metadata.fulltext_record_count OpenDOAR_repository_metadata.metadata_record_count OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id source_set
0 dedup::038ef33e8d3de51d3536d62e6c103be7 [roar::6167, opendoar::2717, roar::6580] [6167, 2717, 6580] [Institutional Repository UIN Syarif Hidayatul... [roar, OpenDOAR, roar] [roar_6167, OpenDOAR_2717, roar_6580] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, 2717, nan] [nan, {"name": "institutional repository uin s... [nan, [], nan] [nan, http://repository.uinjkt.ac.id/dspace/, ... [nan, nan, nan] [nan, institutional, nan] [nan, [], nan] [nan, 2022-01-12 15:35:36, nan] [nan, 2013-07-11 15:52:01, nan] [nan, [science, arts, humanities, social scien... [nan, [theses_and_dissertations], nan] [nan, [{'name': 'uin syarif hidayatullah jakar... [nan, [], nan] [nan, {"name": "dspace", "version": ""}, nan] [nan, http://repository.uinjkt.ac.id/oai/, nan] [nan, yes, nan] [nan, nan, nan] [nan, 0.0, nan] [nan, 36862.0, nan] [nan, OpenDOAR_2717, nan] [6167, nan, 6580] [13, nan, 16] [archive, nan, archive] [2178, nan, 2040] [nan, nan, nan] [nan, nan, nan] [disk0/00/00/61/67, nan, disk0/00/00/65/80] [2012-12-12 05:42:58, nan, 2013-03-31 16:02:51] [2013-07-14 15:12:12, nan, 2013-04-06 01:42:14] [2012-12-12 05:42:58, nan, 2013-03-31 16:02:51] [institutional, nan, institutional] [nan, nan, nan] [nan, nan, nan] [show, nan, show] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [http://repository.uinjkt.ac.id, nan, http://r... [Institutional Repository UIN Syarif Hidayatul... [[http://repository.uinjkt.ac.id/oai, http://r... [http://repository.uinjkt.ac.id/sword/, nan, nan] [nan, nan, nan] [nan, nan, nan] [nstitutional Repository UIN Syarif Hidayatull... [TRUE, nan, FALSE] [TRUE, nan, FALSE] [FALSE, nan, FALSE] [[UIN Syarif Hidayatullah Jakarta, Pascasarjan... [[http://www.uinjkt.ac.id, http://graduate.uin... [id, nan, id] [Jakarta, nan, Jakarta] [nan, nan, 106.756] [nan, nan, -6.30591] [dspace, nan, dspace] [geoname_2_ID, nan, geoname_2_ID] [other, nan, other] [AI, nan, [AC, Z665, Z004, Z719, BP, Q1]] [2012-11-07 08:11:19, nan, 2013-03-04 07:20:37] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [[celestial, opendoar], nan, nan] [[5108, 2717, 5109], nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [roar_6167, nan, roar_6580] {OpenDOAR, roar}
1 dedup::044edcd1c961b3942a7e0e90d1005e2d [roar::7902, opendoar::2468, roar::5216] [7902, 2468, 5216] [The University of Arizona Campus Repository, ... [roar, OpenDOAR, roar] [roar_7902, OpenDOAR_2468, roar_5216] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, 2468, nan] [nan, {"name": "university of arizona campus r... [nan, [], nan] [nan, http://arizona.openrepository.com/arizon... [nan, nan, nan] [nan, institutional, nan] [nan, [], nan] [nan, 2022-01-12 15:35:32, nan] [nan, 2012-05-02 09:50:07, nan] [nan, [science, arts, humanities, health and m... [nan, [journal_articles, theses_and_dissertati... [nan, [{'name': 'university of arizona', 'alte... [nan, [], nan] [nan, {"name": "other", "version": ""}, nan] [nan, nan, nan] [nan, yes, nan] [nan, nan, nan] [nan, nan, nan] [nan, 63231.0, nan] [nan, OpenDOAR_2468, nan] [7902, nan, 5216] [12, nan, 8] [archive, nan, archive] [4910, nan, 8] [nan, nan, nan] [nan, nan, nan] [disk0/00/00/79/02, nan, disk0/00/00/52/16] [2014-03-05 11:50:29, nan, 2012-05-16 23:47:28] [2014-05-08 13:10:29, nan, 2012-05-19 01:46:06] [2014-03-05 11:50:29, nan, 2012-05-16 23:47:28] [institutional, nan, institutional] [nan, nan, nan] [nan, nan, nan] [show, nan, show] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [http://arizona.openrepository.com/arizona/, n... [The University of Arizona Campus Repository, ... [http://arizona.openrepository.com/arizona/oai... [nan, nan, nan] [http://arizona.openrepository.com/arizona/fee... [nan, nan, nan] [The UA Campus Repository is an institutional ... [TRUE, nan, nan] [TRUE, nan, nan] [FALSE, nan, nan] [The University of Arizona, nan, University of... [http://www.arizona.edu/, nan, http://www.ariz... [us, nan, us] [Tucson, nan, nan] [nan, nan, 32.2531] [nan, nan, -110.948] [dspace, nan, nan] [geoname_2_US, nan, geoname_2_US] [other, nan, other] [nan, nan, nan] [2014-02-25 20:17:47, nan, 2012-05-13 15:12:37] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [[celestial, opendoar], nan, opendoar] [[5404, http://opendoar.org/id/2468/], nan, 2468] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [roar_7902, nan, roar_5216] {OpenDOAR, roar}
2 dedup::0468c62a26a75be73109e1efa74bee44 [roar::12182, opendoar::3096, roar::8677] [12182, 3096, 8677] [ScholarWorks @ UVM, scholarworks @ uvm, Schol... [roar, OpenDOAR, roar] [roar_12182, OpenDOAR_3096, roar_8677] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, 3096, nan] [nan, {"name": "scholarworks @ uvm", "language... [nan, [], nan] [nan, http://scholarworks.uvm.edu/, nan] [nan, nan, nan] [nan, institutional, nan] [nan, [], nan] [nan, 2022-01-12 15:35:42, nan] [nan, 2014-06-26 16:50:45, nan] [nan, [science, technology, engineering, mathe... [nan, [journal_articles, conference_and_worksh... [nan, [{'name': 'university of vermont', 'alte... [nan, [{"policy_url": "http://scholarworks.uvm... [nan, {"name": "other", "version": ""}, nan] [nan, http://scholarworks.uvm.edu/do/oai/, nan] [nan, yes, nan] [nan, nan, nan] [nan, nan, nan] [nan, 2871.0, nan] [nan, OpenDOAR_3096, nan] [12182, nan, 8677] [11, nan, 11] [archive, nan, archive] [404, nan, 5634] [nan, nan, nan] [nan, nan, nan] [disk0/00/01/21/82, nan, disk0/00/00/86/77] [2017-03-11 17:50:59, nan, 2014-10-08 18:32:00] [2017-03-18 02:36:55, nan, 2014-10-11 01:36:04] [2017-03-11 17:50:59, nan, 2014-10-08 18:32:00] [institutional, nan, institutional] [nan, nan, nan] [nan, nan, nan] [show, nan, show] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [http://scholarworks.uvm.edu/, nan, http://sch... [ScholarWorks @ UVM, nan, ScholarWorks @ UVM] [http://scholarworks.uvm.edu/do/oai/, nan, htt... [nan, nan, nan] [http://scholarworks.uvm.edu/recent.rss, nan, ... [nan, nan, nan] [ScholarWorks @ UVM collects, preserves, and s... [TRUE, nan, TRUE] [TRUE, nan, TRUE] [FALSE, nan, FALSE] [University of Vermont, nan, University of Ver... [https://www.uvm.edu/, nan, http://www.uvm.edu] [us, nan, us] [Burlington, VT, nan, Burington] [44.4759, nan, 44.4856] [-73.2121, nan, -73.2117] [bepress, nan, bepress] [geoname_2_US, nan, geoname_2_US] [other, nan, other] [nan, nan, nan] [2017-01-13 20:44:06, nan, 2014-07-16 21:08:43] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [celestial, nan, [celestial, opendoar]] [5654, nan, [http://opendoar.org/id/3096/, 5654]] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [roar_12182, nan, roar_8677] {OpenDOAR, roar}
3 dedup::053eb8ab14c76525fd6f1daeb061f064 [opendoar::9528, roar::15805, roar::15765] [9528, 15805, 15765] [repositorio institucional históricas - unam, ... [OpenDOAR, roar, roar] [OpenDOAR_9528, roar_15805, roar_15765] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [9528, nan, nan] [{"name": "repositorio institucional hist\u00f... [[], nan, nan] [http://ru.historicas.unam.mx, nan, nan] [nan, nan, nan] [institutional, nan, nan] [[], nan, nan] [2022-01-12 15:36:31, nan, nan] [2020-02-25 08:36:10, nan, nan] [[humanities, technology], nan, nan] [[journal_articles, other_special_item_types],... [[{'name': 'unam', 'alternativeName': 'institu... [[], nan, nan] [{"name": "dspace", "version": ""}, nan, nan] [http://ru.historicas.unam.mx/oai/request, nan... [yes, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [OpenDOAR_9528, nan, nan] [nan, 15805, 15765] [nan, 5, 19] [nan, archive, archive] [nan, 12662, 12662] [nan, nan, nan] [nan, nan, nan] [nan, disk0/00/01/58/05, disk0/00/01/57/65] [nan, 2020-10-19 15:32:48, 2020-10-19 15:31:52] [nan, 2021-01-25 22:20:40, 2021-01-26 20:47:24] [nan, 2020-10-19 15:32:48, 2020-10-19 15:31:52] [nan, institutional, institutional] [nan, 15765, nan] [nan, nan, nan] [nan, show, no_search] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, http://ru.historicas.unam.mx/, https://r... [nan, Repositorio Institucional Históricas-UNA... [nan, http://ru.historicas.unam.mx/oai/request... [nan, nan, nan] [nan, http://ru.historicas.unam.mx/feed/rss_1.... [nan, nan, nan] [nan, El Repositorio Institucional Históricas-... [nan, TRUE, TRUE] [nan, TRUE, TRUE] [nan, TRUE, TRUE] [nan, Instituto de Investigaciones Históricas,... [nan, http://www.historicas.unam.mx/, http://w... [nan, mx, mx] [nan, Ciudad de México, Ciudad de México] [nan, nan, nan] [nan, nan, nan] [nan, dspace, dspace] [nan, geoname_2_MX, geoname_2_MX] [nan, other, other] [nan, [D1, E11, F1201, D111, D901, DP, D204, D... [nan, 2020-02-14 18:36:03, 2020-02-14 18:36:03] [nan, ¿Quién puede depositar documentos en el ... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, opendoar] [nan, nan, 9528] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, [russell_group, ivy_league], [russell_gr... [nan, roar_15805, roar_15765] {OpenDOAR, roar}
4 dedup::06a4be0dca480e71b823fd599ed221a0 [opendoar::2557, roar::5840, roar::5915] [2557, 5840, 5915] [biblioteka cyfrowa diecezji legnickiej, Bibli... [OpenDOAR, roar, roar] [OpenDOAR_2557, roar_5840, roar_5915] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [2557, nan, nan] [{"name": "biblioteka cyfrowa diecezji legnick... [[], nan, nan] [http://bcdl.pl/dlibra, nan, nan] [nan, nan, nan] [institutional, nan, nan] [[], nan, nan] [2022-01-12 15:35:34, nan, nan] [2012-08-20 11:35:42, nan, nan] [[humanities], nan, nan] [[journal_articles, books_chapters_and_section... [[{'name': 'biblioteka wyższego seminarium duc... [[], nan, nan] [{"name": "dlibra", "version": ""}, nan, nan] [http://bcdl.pl/dlibra/oai-pmh-repository.xml,... [yes, nan, nan] [nan, nan, nan] [0.0, nan, nan] [368.0, nan, nan] [OpenDOAR_2557, nan, nan] [nan, 5840, 5915] [nan, 9, 9] [nan, archive, archive] [nan, 8, 8] [nan, nan, nan] [nan, nan, nan] [nan, disk0/00/00/58/40, disk0/00/00/59/15] [nan, 2012-12-12 04:59:36, 2012-12-12 05:12:30] [nan, 2012-12-17 06:53:45, 2012-12-17 06:53:50] [nan, 2012-12-12 04:59:36, 2012-12-12 05:12:30] [nan, institutional, institutional] [nan, nan, nan] [nan, nan, nan] [nan, show, show] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, http://bcdl.pl/, http://bcdl.pl/dlibra] [nan, Biblioteka Cyfrowa Diecezji Legnickiej, ... [nan, http://bcdl.pl/dlibra/oai-pmh-repository... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, This site provides access to the digitis... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, Biblioteka Wyższego Seminarium Duchowneg... [nan, http://www.biblioteka.diecezja.legnica.p... [nan, pl, pl] [nan, nan, nan] [nan, 51.207, 51.207] [nan, 16.1553, 16.1553] [nan, nan, nan] [nan, geoname_2_PL, geoname_2_PL] [nan, other, other] [nan, nan, nan] [nan, 2012-08-26 15:12:13, 2012-09-16 15:12:16] [nan, nan, nan] [nan, nan, nan] [nan, 0, 0] [nan, 0, 0] [nan, 0, 0] [nan, 20, 20] [nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, [celestial, opendoar], [celestial, opend... [nan, [2557, 5081], [2557, 5081]] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, nan, nan] [nan, roar_5840, roar_5915] {OpenDOAR, roar}
In [38]:
dup_across = dup_across.groupby('dedup_id').aggregate(list).reset_index()
dup_across['source_set'] = dup_across.source.map(set)
dup_across.head()
Out[38]:
dedup_id duplicate_id original_id name source unique_id FAIRsharing_id FAIRsharing_type FAIRsharing_attributes.created-at FAIRsharing_attributes.updated-at FAIRsharing_attributes.metadata.doi FAIRsharing_attributes.metadata.name FAIRsharing_attributes.metadata.status FAIRsharing_attributes.metadata.contacts FAIRsharing_attributes.metadata.homepage FAIRsharing_attributes.metadata.identifier FAIRsharing_attributes.metadata.description FAIRsharing_attributes.metadata.abbreviation FAIRsharing_attributes.metadata.support-links FAIRsharing_attributes.metadata.year-creation FAIRsharing_attributes.metadata.data-processes FAIRsharing_attributes.metadata.cross-references FAIRsharing_attributes.legacy-ids FAIRsharing_attributes.fairsharing-registry FAIRsharing_attributes.record-type FAIRsharing_attributes.subjects FAIRsharing_attributes.domains FAIRsharing_attributes.taxonomies FAIRsharing_attributes.user-defined-tags FAIRsharing_attributes.countries FAIRsharing_attributes.name FAIRsharing_attributes.abbreviation FAIRsharing_attributes.url FAIRsharing_attributes.doi FAIRsharing_attributes.fairsharing-licence FAIRsharing_attributes.description FAIRsharing_attributes.publications FAIRsharing_attributes.licence-links FAIRsharing_attributes.url-for-logo FAIRsharing_attributes.metadata.citations FAIRsharing_attributes.metadata.associated-tools FAIRsharing_attributes.metadata.deprecation-reason FAIRsharing_attributes.metadata.data-access-condition.type FAIRsharing_attributes.metadata.data-contact-information FAIRsharing_attributes.metadata.data-deposition-condition.url FAIRsharing_attributes.metadata.data-deposition-condition.type FAIRsharing_attributes.metadata.deprecation-date FAIRsharing_attributes.metadata.access-points FAIRsharing_attributes.metadata.data-access-condition.url FAIRsharing_attributes.metadata.resource-sustainability.url FAIRsharing_attributes.metadata.resource-sustainability.name FAIRsharing_attributes.metadata.data-preservation-policy.url FAIRsharing_attributes.metadata.data-preservation-policy.name FAIRsharing_attributes.metadata.data-access-for-pre-publication-review FAIRsharing_attributes.metadata.data-versioning FAIRsharing_attributes.metadata.data-curation.type FAIRsharing_attributes.metadata.data-curation.url FAIRsharing_attributes.metadata.citation-to-related-publications FAIRsharing_attributes.metadata.tombstone FAIRsharing_unique_id re3data_orgIdentifier re3data_repositoryName re3data_repositoryName.language re3data_additionalName re3data_repositoryURL re3data_repositoryIdentifier re3data_repositoryContact re3data_description re3data_description.language re3data_type re3data_size re3data_startDate re3data_endDate re3data_repositoryLanguage re3data_subject re3data_missionStatementURL re3data_contentType re3data_providerType re3data_keyword re3data_institution re3data_policy re3data_databaseAccess re3data_databaseLicense re3data_dataAccess re3data_dataLicense re3data_dataUploadType re3data_dataUploadLicense re3data_software re3data_versioning re3data_api re3data_pidSystem re3data_citationGuidelineURL re3data_aidSystem re3data_enhancedPublication re3data_qualityManagement re3data_certificate re3data_metadataStandard re3data_syndication re3data_remarks re3data_entryDate re3data_lastUpdate re3data_unique_id OpenDOAR_system_metadata.id OpenDOAR_repository_metadata.name OpenDOAR_repository_metadata.alternativename OpenDOAR_repository_metadata.url OpenDOAR_repository_metadata.description OpenDOAR_repository_metadata.type OpenDOAR_repository_metadata.content_languages OpenDOAR_system_metadata.date_modified OpenDOAR_system_metadata.date_created OpenDOAR_repository_metadata.content_subjects OpenDOAR_repository_metadata.content_types OpenDOAR_organization OpenDOAR_policy_urls OpenDOAR_repository_metadata.software OpenDOAR_repository_metadata.oai_url OpenDOAR_system_metadata.publicly_visible OpenDOAR_repository_metadata.repository_status OpenDOAR_repository_metadata.fulltext_record_count OpenDOAR_repository_metadata.metadata_record_count OpenDOAR_unique_id roar_eprintid roar_rev_number roar_eprint_status roar_userid roar_importid roar_source roar_dir roar_datestamp roar_lastmod roar_status_changed roar_type roar_succeeds roar_commentary roar_metadata_visibility roar_latitude roar_longitude roar_relation_type roar_relation_uri roar_item_issues_id roar_item_issues_type roar_item_issues_description roar_item_issues_timestamp roar_item_issues_status roar_item_issues_reported_by roar_item_issues_resolved_by roar_item_issues_comment roar_item_issues_count roar_sword_depositor roar_sword_slug roar_exemplar roar_home_page roar_title roar_oai_pmh roar_sword_endpoint roar_rss_feed roar_twitter_feed roar_description roar_fulltext roar_open_access roar_mandate roar_organisation_title roar_organisation_home_page roar_location_country roar_location_city roar_location_latitude roar_location_longitude roar_software roar_geoname roar_version roar_subjects roar_date roar_note roar_suggestions roar_activity_low roar_activity_medium roar_activity_high roar_recordcount roar_recordhistory roar_fulltexts_total roar_fulltexts_docs roar_fulltexts_rtotal roar_fulltexts_rdocs roar_registry_name roar_registry_id roar_submit_to roar_submitted_to_name roar_submitted_to_done roar_webometrics_rank roar_webometrics_size roar_webometrics_visibility roar_webometrics_rich_files roar_webometrics_scholar roar_monthly_deposits roar_total_deposits roar_association roar_unique_id source_set
0 dedup::001e6d882e54c780ce269d3c46997287 [https://fairsharing.org/10.25504/FAIRsharing.... [2094, r3d100011306] [RESID Database of Protein Modifications, RESI... [FAIRsharing, re3data] [FAIRsharing_2094, re3data_r3d100011306] [2094, nan] [fairsharing-records, nan] [2014-11-04T15:23:40.000Z, nan] [2021-12-06T10:49:03.952Z, nan] [10.25504/FAIRsharing.qaszjp, nan] [RESID Database of Protein Modifications, nan] [ready, nan] [[{'contact-name': 'John S Garavelli', 'contac... [http://pir.georgetown.edu/resid/, nan] [2094.0, nan] [The RESID Database of Protein Modifications i... [RESID, nan] [[{'url': 'http://pir.georgetown.edu/resid/faq... [nan, nan] [[{'url': 'ftp://ftp.pir.georgetown.edu/pir_da... [[{'url': 'https://www.re3data.org/repository/... [[biodbcore-000563, bsg-d000563], nan] [Database, nan] [knowledgebase, nan] [[Life Science], nan] [[Molecular structure, Small molecule, Structu... [[All], nan] [[], nan] [[United Kingdom, European Union, Switzerland]... [FAIRsharing record for: RESID Database of Pro... [RESID, nan] [https://fairsharing.org/10.25504/FAIRsharing.... [10.25504/FAIRsharing.qaszjp, nan] [https://creativecommons.org/licenses/by-sa/4.... [This FAIRsharing record describes: The RESID ... [[{'id': 334, 'pubmed_id': 12520062, 'title': ... [[{'licence-name': 'Open Data Commons (ODC) Pu... [None, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [FAIRsharing_2094, nan] [nan, r3d100011306] [nan, RESID Database of Protein Modifications] [nan, eng] [nan, []] [nan, https://pir.georgetown.edu/resid/resid.s... [nan, [FAIRsharing_doi:10.25504/FAIRsharing.qa... [nan, ["pirmail@georgetown.edu"]] [nan, The RESID Database of Protein Modificati... [nan, eng] [nan, [disciplinary]] [nan, {"size": "", "updatedp": ""}] [nan, 2014] [nan, nan] [nan, ["eng"]] [nan, [{'name': '2 Life Sciences', 'scheme': '... [nan, nan] [nan, [{'name': 'Images', 'scheme': 'parse'}, ... [nan, [dataProvider]] [nan, [genomes, life sciences, proteins, prote... [nan, [{'institutionName': 'Georgetown Univers... [nan, [{"policyName": "Terms of Use", "policyU... [nan, {"databaseAccessType": "open", "databas... [nan, []] [nan, [{"dataAccessType": "open", "dataAccessR... [nan, [{"dataLicenseName": "Copyrights", "data... [nan, closed] [nan, []] [nan, ["unknown"]] [nan, yes] [nan, {"api": "ftp://ftp.pir.georgetown.edu/da... [nan, ["none"]] [nan, nan] [nan, []] [nan, yes] [nan, unknown] [nan, []] [nan, []] [nan, {}] [nan, RESID is covered by Thomson Reuters Data... [nan, 2014-12-05] [nan, 2019-01-17] [nan, re3data_r3d100011306] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] {FAIRsharing, re3data}
1 dedup::003ab6b40af9b488decea7c582d150a2 [re3data::r3d100011894, https://fairsharing.or... [r3d100011894, 2315] [Synapse, Synapse] [re3data, FAIRsharing] [re3data_r3d100011894, FAIRsharing_2315] [nan, 2315] [nan, fairsharing-records] [nan, 2016-08-02T13:56:30.000Z] [nan, 2021-12-06T10:48:25.700Z] [nan, 10.25504/FAIRsharing.dnxzmk] [nan, Synapse] [nan, ready] [nan, [{'contact-name': 'Meredith Slota', 'con... [nan, https://www.synapse.org/] [nan, 2315.0] [nan, Synapse is a collaborative research plat... [nan, Synapse] [nan, [{'url': 'SynapseInfo@sagebase.org', 'na... [nan, 2010.0] [nan, [{'url': 'https://www.synapse.org/', 'na... [nan, [{'url': 'https://www.re3data.org/reposi... [nan, [biodbcore-000791, bsg-d000791]] [nan, Database] [nan, repository] [nan, [Data Integration, Data Management, Biom... [nan, [Experimental measurement, Protocol, Dat... [nan, [All]] [nan, []] [nan, [United States]] [nan, FAIRsharing record for: Synapse] [nan, Synapse] [nan, https://fairsharing.org/10.25504/FAIRsha... [nan, 10.25504/FAIRsharing.dnxzmk] [nan, https://creativecommons.org/licenses/by-... [nan, This FAIRsharing record describes: Synap... [nan, [{'id': 2450, 'pubmed_id': 24071850, 'ti... [nan, [{'licence-name': 'Creative Commons Attr... [nan, None] [nan, nan] [nan, [{'url': 'https://sage-bionetworks.githu... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, [{'url': 'http://rest-docs.synapse.org/r... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, FAIRsharing_2315] [r3d100011894, nan] [Synapse, nan] [eng, nan] [[], nan] [https://www.synapse.org, nan] [[FAIRsharing_DOI:10.25504/FAIRsharing.dnxzmk,... [["synapseinfo@sagebase.org"], nan] [Synapse is an open source software platform t... [eng, nan] [[other], nan] [{"size": "", "updatedp": ""}, nan] [2012-05-22, nan] [nan, nan] [["eng"], nan] [[{'name': '2 Life Sciences', 'scheme': 'DFG'}... [https://sagebionetworks.org/tools_resources/s... [[{'name': 'Raw data', 'scheme': 'parse'}, {'n... [[dataProvider, serviceProvider], nan] [[AMP-AD Knowledge Portal, DREAM Challenges, G... [[{'institutionName': 'Alfred P. Sloan Foundat... [[{"policyName": "Synapse Commons Governance O... [ {"databaseAccessType": "open", "databaseAcce... [[], nan] [[{"dataAccessType": "closed", "dataAccessRest... [[{"dataLicenseName": "other", "dataLicenseURL... [restricted, nan] [[], nan] [["unknown"], nan] [yes, nan] [{"api": "https://docs.synapse.org/rest/", "ap... [["DOI"], nan] [nan, nan] [[], nan] [yes, nan] [yes, nan] [[], nan] [[], nan] [{}, nan] [nan, nan] [2015-12-03, nan] [2021-11-16, nan] [re3data_r3d100011894, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] {FAIRsharing, re3data}
2 dedup::0048f2e3aa55ab88aaaac0cfa4153ad5 [opendoar::4562, roar::14673] [4562, 14673] [erzincan binali yıldırım university instituti... [OpenDOAR, roar] [OpenDOAR_4562, roar_14673] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [4562, nan] [{"name": "erzincan binali y\u0131ld\u0131r\u0... [[], nan] [http://earsiv.erzincan.edu.tr, nan] [nan, nan] [institutional, nan] [[], nan] [2022-01-12 15:36:06, nan] [2019-04-24 09:06:10, nan] [[social sciences], nan] [[journal_articles], nan] [[{'name': 'erzincan binali yıldırım universit... [[], nan] [{"name": "dspace", "version": ""}, nan] [http://earsiv.erzincan.edu.tr/oai, nan] [yes, nan] [nan, nan] [nan, nan] [nan, nan] [OpenDOAR_4562, nan] [nan, 14673] [nan, 9] [nan, archive] [nan, 11738] [nan, nan] [nan, nan] [nan, disk0/00/01/46/73] [nan, 2019-07-19 14:26:33] [nan, 2019-07-22 08:07:03] [nan, 2019-07-19 14:26:33] [nan, institutional] [nan, nan] [nan, nan] [nan, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, http://earsiv.erzincan.edu.tr] [nan, Erzincan Binali Yıldırım University Inst... [nan, http://earsiv.erzincan.edu.tr/oai] [nan, nan] [nan, nan] [nan, nan] [nan, DSpace@Erzincan is a growing collection ... [nan, TRUE] [nan, TRUE] [nan, TRUE] [nan, Erzincan Binali Yıldırım University] [nan, http://www.ebyu.edu.tr] [nan, tr] [nan, Erzincan] [nan, 39.7463] [nan, 39.5149] [nan, dspace] [nan, geoname_2_TR] [nan, other] [nan, nan] [nan, 2019-04-18 19:09:41] [nan, DSpace@Erzincan is a growing collection ... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, roar_14673] {OpenDOAR, roar}
3 dedup::00a35b4a2495a342f5632d18cf5985f6 [opendoar::6787, roar::13960] [6787, 13960] [scholarly commons university of the pacific, ... [OpenDOAR, roar] [OpenDOAR_6787, roar_13960] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [6787, nan] [{"name": "scholarly commons university of the... [[], nan] [https://scholarlycommons.pacific.edu, nan] [nan, nan] [institutional, nan] [[], nan] [2022-01-12 15:36:16, nan] [2019-09-28 02:20:20, nan] [[science, technology, engineering, mathematic... [[journal_articles, theses_and_dissertations, ... [[{'name': 'university of the pacific', 'alter... [[], nan] [{"name": "digital_commons", "version": ""}, nan] [https://scholarlycommons.pacific.edu/do/oai, ... [yes, nan] [nan, nan] [6534.0, nan] [59858.0, nan] [OpenDOAR_6787, nan] [nan, 13960] [nan, 10] [nan, archive] [nan, 11103] [nan, nan] [nan, nan] [nan, disk0/00/01/39/60] [nan, 2018-10-07 12:48:04] [nan, 2018-10-13 01:35:50] [nan, 2018-10-07 12:48:04] [nan, institutional] [nan, nan] [nan, nan] [nan, show] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, https://scholarlycommons.pacific.edu/] [nan, Scholarly Commons - University of the Pa... [nan, https://scholarlycommons.pacific.edu/do/... [nan, nan] [nan, https://scholarlycommons.pacific.edu/rec... [nan, nan] [nan, Scholarly Commons is a service of the Un... [nan, TRUE] [nan, TRUE] [nan, FALSE] [nan, University of the Pacific] [nan, https://www.pacific.edu/] [nan, us] [nan, [Sacramento, Stockton, San Francisco]] [nan, nan] [nan, nan] [nan, bepress] [nan, geoname_2_US] [nan, other] [nan, nan] [nan, 2018-09-05 23:09:53] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, celestial] [nan, 6700] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, roar_13960] {OpenDOAR, roar}
4 dedup::00a6af15fba302b272b110ac88924779 [roar::755, opendoar::1285] [755, 1285] [KFUPM ePrints, kfupm eprints] [roar, OpenDOAR] [roar_755, OpenDOAR_1285] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, 1285] [nan, {"name": "kfupm eprints", "language": "e... [nan, []] [nan, http://eprints.kfupm.edu.sa/] [nan, nan] [nan, institutional] [nan, []] [nan, 2022-01-12 15:35:13] [nan, 2008-07-18 11:11:28] [nan, [science, humanities, mathematics, techn... [nan, [journal_articles, conference_and_worksh... [nan, [{'name': 'king fahd university of petro... [nan, []] [nan, {"name": "eprints", "version": ""}] [nan, http://eprints.kfupm.edu.sa/cgi/oai2] [nan, yes] [nan, nan] [nan, 4890.0] [nan, 6221.0] [nan, OpenDOAR_1285] [755, nan] [511, nan] [archive, nan] [1, nan] [nan, nan] [nan, nan] [disk0/00/00/07/55, nan] [2010-01-06 13:44:43, nan] [2011-07-18 05:50:28, nan] [2010-01-06 13:44:43, nan] [institutional, nan] [nan, nan] [nan, nan] [show, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [nan, nan] [0, nan] [nan, nan] [nan, nan] [nan, nan] [http://eprints.kfupm.edu.sa/, nan] [KFUPM ePrints, nan] [http://eprints.kfupm.edu.sa/perl/oai2, nan] [nan, nan] [http://eprints.kfupm.edu.sa/cgi/latest_tool?o... [nan, nan] [nan, nan] [TRUE, nan] [TRUE, nan] [nan, nan] [King Fahd University of Petroleum and Mineral... [http://www.kfupm.edu.sa, nan] [sa, nan] [ DHAHRAN, nan] [17.4333, nan] [43.2167, nan] [eprints, nan] [geoname_2_SA, nan] [3.2.7, nan] [nan, nan] [2007-09-10 11:33:13, nan] [nan, nan] [nan, nan] [0, nan] [0, nan] [0, nan] [100, nan] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100... [nan, nan] [nan, nan] [nan, nan] [nan, nan] [[celestial, opendoar], nan] [[1234, 1285], nan] [nan, nan] [nan, nan] [nan, nan] [37, nan] [26, nan] [69, nan] [23, nan] [203, nan] [nan, nan] [nan, nan] [nan, nan] [roar_755, nan] {OpenDOAR, roar}
In [39]:
def remove_nan(list_obj):
    if isinstance(list_obj, list):
        while np.nan in list_obj:
            list_obj.remove(np.nan)
    return list_obj
    
dup_within.applymap(remove_nan).to_csv('../data/processed/dup_within.csv')
dup_hybrid.applymap(remove_nan).to_csv('../data/processed/dup_hybrid.csv')
dup_across.applymap(remove_nan).to_csv('../data/processed/dup_across.csv')