792 KiB
792 KiB
In [1]:
import ast
import csv
import json
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
Loading data from registries¶
In [2]:
with open('../data/raw/fairsharing_dump_api_02_2022.json') as f:
lines = f.read().splitlines()
fairsharing_df = pd.DataFrame(lines)
fairsharing_df.columns = ['json_element']
fairsharing_df['json_element'].apply(json.loads)
fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))
fairsharing_df['unique_id'] = 'FAIRsharing_' + fairsharing_df.id
fairsharing_df = fairsharing_df.add_prefix('FAIRsharing_')
fairsharing_df.head()
Out[2]:
FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.metadata.cross-references | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.url-for-logo | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.data-access-condition.type | FAIRsharing_attributes.metadata.data-contact-information | FAIRsharing_attributes.metadata.data-deposition-condition.url | FAIRsharing_attributes.metadata.data-deposition-condition.type | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.data-access-condition.url | FAIRsharing_attributes.metadata.resource-sustainability.url | FAIRsharing_attributes.metadata.resource-sustainability.name | FAIRsharing_attributes.metadata.data-preservation-policy.url | FAIRsharing_attributes.metadata.data-preservation-policy.name | FAIRsharing_attributes.metadata.data-access-for-pre-publication-review | FAIRsharing_attributes.metadata.data-versioning | FAIRsharing_attributes.metadata.data-curation.type | FAIRsharing_attributes.metadata.data-curation.url | FAIRsharing_attributes.metadata.citation-to-related-publications | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3226 | fairsharing-records | 2020-12-09T11:53:44.000Z | 2022-02-08T10:42:36.452Z | 10.25504/FAIRsharing.d6423b | WDC Sunspot Index and Long-term Solar Observat... | ready | [{'contact-name': 'Frédéric Clette', 'contact-... | http://sidc.be/silso/home | 3226 | The WDC-SILSO is an activity of the Operationa... | WDC-SILSO | [{'url': 'http://www.sidc.be/silso/taxonomy/te... | 2013.0 | [{'url': 'http://www.sidc.be/silso/datafiles',... | [{'url': 'https://www.re3data.org/repository/r... | [biodbcore-001740, bsg-d001740] | Database | repository | [Electromagnetism, Astrophysics and Astronomy,... | [Climate, Observation design] | [Not applicable] | [Climate change, earth observation, Electromag... | [Belgium] | FAIRsharing record for: WDC Sunspot Index and ... | WDC-SILSO | https://fairsharing.org/10.25504/FAIRsharing.d... | 10.25504/FAIRsharing.d6423b | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: The WDC-SIL... | [] | [{'licence-name': 'SILSO legal notices', 'lice... | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_3226 |
1 | 2114 | fairsharing-records | 2014-11-04T15:23:40.000Z | 2022-01-21T14:39:02.195Z | 10.25504/FAIRsharing.p06nme | Biological Magnetic Resonance Data Bank | ready | [{'contact-name': 'Helpdesk', 'contact-email':... | https://bmrb.io/ | 2114 | BMRB collects, annotates, archives, and dissem... | BMRB | [{'url': 'https://bmrb.io/bmrb/news/', 'name':... | 1988.0 | [{'url': 'https://bmrb.io/data_library/rsync.s... | [{'url': 'https://www.re3data.org/repository/r... | [biodbcore-000584, bsg-d000584] | Database | repository | [Structural Biology] | [Molecular structure, Protein structure, Pepti... | [All] | [] | [United States] | FAIRsharing record for: Biological Magnetic Re... | BMRB | https://fairsharing.org/10.25504/FAIRsharing.p... | 10.25504/FAIRsharing.p06nme | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: BMRB collec... | [{'id': 552, 'pubmed_id': 18288446, 'title': '... | [{'licence-name': 'wwPDB Privacy and Usage Pol... | None | [{'doi': '10.1093/nar/gkm957', 'pubmed-id': 17... | [{'url': 'https://bmrb.io/validate/', 'name': ... | open | yes | https://bmrb.io/deposit/ | open | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_2114 | |
2 | 3022 | fairsharing-records | 2020-06-17T10:25:30.000Z | 2022-02-08T10:41:04.073Z | 10.25504/FAIRsharing.8b7a2f | Fisheries and Oceans Canada Pacific Region Dat... | ready | [{'contact-name': 'Peter Chandler', 'contact-e... | http://www.pac.dfo-mpo.gc.ca/science/oceans/da... | 3022 | The Institute of Ocean Sciences (IOS)/Ocean Sc... | None | [{'url': 'DFO.PAC.SCI.IOSData-DonneesISO.SCI.P... | NaN | [{'name': 'Users must contact the Senior Analy... | [{'url': 'https://www.re3data.org/repository/r... | [biodbcore-001530, bsg-d001530] | Database | repository | [Environmental Science, Meteorology, Earth Sci... | [Climate] | [Not applicable] | [Salinity, Temperature] | [Canada] | FAIRsharing record for: Fisheries and Oceans C... | None | https://fairsharing.org/10.25504/FAIRsharing.8... | 10.25504/FAIRsharing.8b7a2f | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: The Institu... | [] | [{'licence-name': 'Fisheries and Oceans Canada... | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_3022 |
3 | 2998 | fairsharing-records | 2020-05-21T07:42:30.000Z | 2022-02-08T10:40:19.531Z | 10.25504/FAIRsharing.e08886 | Climate Prediction Center | ready | [{'contact-name': 'Jon Hoopingarner', 'contact... | https://www.cpc.ncep.noaa.gov/ | 2998 | The Climate Prediction Center (CPC) produces o... | CPC | [{'url': 'https://www.cpc.ncep.noaa.gov/commen... | 1970.0 | [{'url': 'https://www.cpc.ncep.noaa.gov/', 'na... | [{'url': 'https://www.re3data.org/repository/r... | [biodbcore-001504, bsg-d001504] | Database | repository | [Hydrogeology, Geography, Meteorology, Geodesy... | [Climate] | [Not applicable] | [Forecasting, weather] | [United States] | FAIRsharing record for: Climate Prediction Center | CPC | https://fairsharing.org/10.25504/FAIRsharing.e... | 10.25504/FAIRsharing.e08886 | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: The Climate... | [] | [{'licence-name': 'National Weather Service Di... | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_2998 |
4 | 2301 | fairsharing-records | 2016-06-03T14:54:08.000Z | 2021-11-24T13:17:51.201Z | 10.25504/FAIRsharing.meh9wz | Acytostelium Gene Database | deprecated | [{'contact-name': 'Acytostelium genome consort... | http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... | 2301 | Genome and transcriptome database of Acytostel... | NaN | NaN | 2008.0 | NaN | NaN | [biodbcore-000775, bsg-d000775] | Database | repository | [Genomics, Life Science, Transcriptomics] | [DNA sequence data, Gene model annotation] | [Acytostelium subglobosum] | [] | [United Kingdom, Japan] | FAIRsharing record for: Acytostelium Gene Data... | None | https://fairsharing.org/10.25504/FAIRsharing.m... | 10.25504/FAIRsharing.meh9wz | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: Genome and ... | [{'id': 1139, 'pubmed_id': 25758444, 'title': ... | [] | None | NaN | NaN | This resource is no longer available at the st... | NaN | NaN | NaN | NaN | 2021-9-17 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_2301 |
In [3]:
re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t',
converters={'subject': ast.literal_eval,
'keyword': ast.literal_eval,
'additionalName': ast.literal_eval,
'repositoryIdentifier': ast.literal_eval,
'type': ast.literal_eval,
'contentType': ast.literal_eval,
'providerType': ast.literal_eval,
'institution': ast.literal_eval
})
re3data_df['unique_id'] = 're3data_' + re3data_df.orgIdentifier
re3data_df = re3data_df.add_prefix('re3data_')
re3data_df.head()
Out[3]:
re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | r3d100000001 | Odum Institute Archive Dataverse | eng | [] | https://dataverse.unc.edu/dataverse/odum | [] | ["https://dataverse.unc.edu/dataverse/odum#", ... | The Odum Institute Archive Dataverse contains ... | eng | [disciplinary] | {"size": "13 dataverses; 3.050 datasets", "upd... | NaN | NaN | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | NaN | [{'name': 'Databases', 'scheme': 'parse'}, {'n... | [dataProvider] | [FAIR, Middle East, crime, demography, economy... | [{'institutionName': 'Odum Institute for Resea... | [{"policyName": "Collection Development Policy... | {"databaseAccessType": "open", "databaseAcces... | [{"databaseLicenseName": "CC0", "databaseLicen... | [{"dataAccessType": "embargoed", "dataAccessRe... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [] | ["DataVerse"] | NaN | {} | ["DOI"] | NaN | [] | unknown | yes | ["other"] | [{"metadataStandardName": "DDI - Data Document... | {} | Odum Dataverse is covered by Thomson Reuters D... | 2013-06-10 | 2021-07-06 | re3data_r3d100000001 |
1 | r3d100000002 | Access to Archival Databases | eng | [{'additionalName': 'AAD', 'additionalNameLang... | https://aad.archives.gov/aad/ | [RRID:SCR_010479, RRID:nlx_157752] | ["https://www.archives.gov/contact"] | You will find in the Access to Archival Databa... | eng | [disciplinary] | {"size": "", "updatedp": ""} | 1985 | NaN | ["eng", "spa"] | [{'name': '1 Humanities and Social Sciences', ... | https://www.archives.gov/publications/general-... | [{'name': 'Images', 'scheme': 'parse'}, {'name... | [dataProvider] | [US History] | [{'institutionName': 'The U.S. National Archiv... | [{"policyName": "Contribution Policy", "policy... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "Copyrights", "dataLicens... | restricted | [] | ["unknown"] | no | {"api": "https://www.archives.gov/developer#to... | ["none"] | https://aad.archives.gov/aad/help/getting-star... | [] | unknown | unknown | [] | [] | {"syndication": "http://www.archives.gov/socia... | NaN | 2012-07-04 | 2021-05-25 | re3data_r3d100000002 |
2 | r3d100000004 | Datenbank Gesprochenes Deutsch | deu | [{'additionalName': 'DGD', 'additionalNameLang... | https://dgd.ids-mannheim.de/ | [] | ["dgd@ids-mannheim.de"] | The "Database for Spoken German (DGD)" is a co... | eng | [disciplinary] | {"size": "34 corpora", "updatedp": "2020-02-03"} | 2012 | NaN | ["deu"] | [{'name': '1 Humanities and Social Sciences', ... | https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... | [{'name': 'Audiovisual data', 'scheme': 'parse... | [dataProvider, serviceProvider] | [Australian German, FOLK, German dialects, Pfe... | [{'institutionName': 'Institut für Deutsche Sp... | [{"policyName": "Erfurter Aufruf zur Sicherung... | {"databaseAccessType": "restricted", "databas... | [] | [{"dataAccessType": "restricted", "dataAccessR... | [{"dataLicenseName": "other", "dataLicenseURL"... | restricted | [] | ["other"] | yes | {} | ["none"] | http://agd.ids-mannheim.de/konditionen.shtml | [] | unknown | unknown | ["RatSWD"] | [] | {} | NaN | 2012-07-20 | 2020-08-27 | re3data_r3d100000004 |
3 | r3d100000005 | UNC Dataverse | eng | [{'additionalName': 'University of North Carol... | https://dataverse.unc.edu/ | [FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c] | ["https://dataverse.unc.edu/", "odumarchive@un... | UNC Dataverse is an open-source repository sof... | eng | [institutional] | {"size": "186 dataverses; 25.272 studies; 229.... | 2011 | NaN | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | https://odum.unc.edu/about/mission-vision/ | [{'name': 'Archived data', 'scheme': 'parse'},... | [dataProvider, serviceProvider] | [FAIR, census, demographic survey, demography,... | [{'institutionName': 'Odum Institute for Resea... | [{"policyName": "Collection Development Policy... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [{"dataUploadLicenseName": "Data Deposit Form"... | ["DataVerse"] | yes | {"api": "https://guides.dataverse.org/en/lates... | ["ARK", "DOI", "PURL", "URN", "hdl"] | https://dataverse.org/best-practices/data-cita... | [] | unknown | yes | [] | [{"metadataStandardName": "DDI - Data Document... | {} | UNC Dataverse is covered by Clarivate Data Cit... | 2012-07-23 | 2021-10-25 | re3data_r3d100000005 |
4 | r3d100000006 | Archaeology Data Service | eng | [{'additionalName': 'ADS', 'additionalNameLang... | https://archaeologydataservice.ac.uk/ | [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] | ["help@archaeologydataservice.ac.uk", "https:/... | The ADS is an accredited digital repository fo... | eng | [disciplinary] | {"size": "1837 results", "updatedp": "2020-05-... | 1996-10-01 | NaN | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | https://archaeologydataservice.ac.uk/about/our... | [{'name': 'Archived data', 'scheme': 'parse'},... | [dataProvider, serviceProvider] | [FAIR, archaeology, cultural heritage, prehist... | [{'institutionName': 'Arts and Humanities Rese... | [{"policyName": "ADS Guides to good practice",... | {"databaseAccessType": "open", "databaseAcces... | [{"databaseLicenseName": "CC", "databaseLicens... | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [{"dataUploadLicenseName": "Guidelines for Dep... | ["other"] | yes | {"api": "https://archaeologydataservice.ac.uk/... | ["DOI"] | https://archaeologydataservice.ac.uk/advice/te... | [] | unknown | yes | ["other"] | [{"metadataStandardName": "DataCite Metadata S... | {"syndication": "https://archaeologydataservic... | ADS is covered by Clarivate Data Citation Inde... | 2012-07-23 | 2021-09-02 | re3data_r3d100000006 |
In [4]:
opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
converters={'repository_metadata.content_subjects': ast.literal_eval,
'repository_metadata.alternativename': ast.literal_eval,
'repository_metadata.content_types': ast.literal_eval,
'organization': ast.literal_eval
},
dtype={'system_metadata.id': str})
opendoar_df['unique_id'] = 'OpenDOAR_' + opendoar_df['system_metadata.id']
opendoar_df = opendoar_df.add_prefix('OpenDOAR_')
opendoar_df.head()
Out[4]:
OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_repository_metadata.repository_status | OpenDOAR_repository_metadata.fulltext_record_count | OpenDOAR_repository_metadata.metadata_record_count | OpenDOAR_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 134 | {"name": "eldorado - repository of the tu dort... | [{'name': 'eldorado - ressourcen aus und für l... | https://eldorado.tu-dortmund.de | NaN | institutional | [] | 2022-01-12 15:34:54 | 2005-12-19 14:57:52 | [arts, humanities, science, mathematics, socia... | [journal_articles, conference_and_workshop_pap... | [{'name': 'technische universität dortmund', '... | [] | {"name": "dspace", "version": ""} | https://eldorado.tu-dortmund.de/oai/request | yes | NaN | 9629.0 | 20963.0 | OpenDOAR_134 |
1 | 58 | {"name": "archive ouverte en sciences de linfo... | [{'acronym': '@rchivesic'}] | https://archivesic.ccsd.cnrs.fr | NaN | institutional | [] | 2022-01-12 15:34:53 | 2006-01-13 12:48:32 | [arts, science, technology, engineering, mathe... | [journal_articles, conference_and_workshop_pap... | [{'name': 'centre pour la communication scient... | [] | {"name": "hal", "version": ""} | https://api.archives-ouvertes.fr/oai/archivesic | yes | NaN | 55492.0 | 1137498.0 | OpenDOAR_58 |
2 | 93 | {"name": "digitalcommons@the texas medical cen... | [] | http://digitalcommons.library.tmc.edu/ | NaN | institutional | [] | 2022-01-12 15:34:53 | 2006-02-14 11:16:12 | [health and medicine] | [journal_articles, theses_and_dissertations] | [{'name': 'texas medical center', 'alternative... | [] | {"name": "other", "version": ""} | http://digitalcommons.library.tmc.edu/do/oai/ | yes | NaN | 2658.0 | 7268.0 | OpenDOAR_93 |
3 | 68 | {"name": "cognitive sciences eprint archive", ... | [{'acronym': 'cogprints'}] | http://cogprints.org/ | NaN | disciplinary | [] | 2022-01-12 15:34:53 | 2006-01-04 15:01:23 | [humanities, health and medicine, science, soc... | [journal_articles, conference_and_workshop_pap... | [{'name': 'university of southampton', 'altern... | [] | {"name": "eprints", "version": ""} | http://cogprints.org/cgi/oai2 | yes | NaN | 2895.0 | 4277.0 | OpenDOAR_68 |
4 | 84 | {"name": "digital commons@carleton college", "... | [] | http://digitalcommons.carleton.edu/ | NaN | institutional | [] | 2022-01-12 15:34:53 | 2006-01-04 16:07:58 | [humanities, science, social sciences] | [journal_articles, unpub_reports_and_working_p... | [{'name': 'carleton college', 'alternativeName... | [] | {"name": "other", "version": ""} | NaN | yes | NaN | NaN | 42.0 | OpenDOAR_84 |
In [5]:
roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
roar_df = roar_df.groupby('eprintid').aggregate(set)
def value_or_list(cell_set):
copy = set(cell_set)
copy.discard(np.nan)
if len(copy) == 0:
return np.nan
if len(copy) == 1:
return copy.pop()
return list(copy)
roar_df = roar_df.applymap(value_or_list)
roar_df.reset_index(inplace=True)
roar_df['unique_id'] = 'roar_' + roar_df.eprintid
roar_df = roar_df.add_prefix('roar_')
roar_df.head()
Out[5]:
roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 633 | archive | 1 | NaN | NaN | disk0/00/00/00/01 | 2010-01-06 13:43:48 | 2011-07-18 05:40:07 | 2010-01-06 13:43:48 | subject | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://archivesic.ccsd.cnrs.fr/ | @RCHIVESIC | http://archivesic.ccsd.cnrs.fr/oai/oai.php | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | fr | NaN | NaN | NaN | hal | geoname_2_FR | other | NaN | 2002-05-17 19:24:41 | NaN | NaN | 0 | 0 | 0 | 25 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... | NaN | NaN | NaN | NaN | [celestial, opendoar] | [58, 669] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_1 |
1 | 10 | 511 | archive | 1 | NaN | NaN | disk0/00/00/00/10 | 2010-01-06 13:43:48 | 2011-07-18 05:40:13 | 2010-01-06 13:43:48 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://www.diva-portal.org/mdh/ | Academic Archive On-line (Mälardalen Universit... | http://www.diva-portal.org/oai/mdh/OAI | NaN | NaN | NaN | NaN | TRUE | TRUE | NaN | NaN | NaN | se | Uppsala | 59.8667 | 17.6333 | diva | geoname_2_SE | other | NaN | 2005-12-08 13:15:22 | NaN | NaN | 0 | 0 | 0 | 100 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,100... | NaN | NaN | NaN | NaN | [celestial, opendoar] | [526, 258] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_10 |
2 | 1000 | 274 | archive | 1 | NaN | NaN | disk0/00/00/10/00 | 2010-01-06 13:45:01 | 2011-07-06 08:21:21 | 2010-01-06 13:45:01 | subject | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://pam.pisharp.org/ | PAM - Portuguese Archive of Mathematics | NaN | NaN | NaN | NaN | NaN | TRUE | TRUE | NaN | NaN | NaN | pt | Bellevue, WA | 47.6034 | -122.155 | dspace | geoname_2_PT | other | NaN | 2006-05-04 10:48:14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_1000 |
3 | 10001 | 20 | archive | 91 | NaN | NaN | disk0/00/01/00/01 | 2015-08-08 14:52:11 | 2016-03-21 19:44:01 | 2015-08-08 14:52:11 | subject | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://edoc.sub.uni-hamburg.de/klimawandel/ | Klimawandel Dokumentenserver | http://edoc.sub.uni-hamburg.de/klimawandel/oai | NaN | NaN | NaN | The "Documentenserver Klimawandel" (Repository... | TRUE | TRUE | TRUE | [Climate Service Center 2.0, Helmholtz-Zentrum... | [http://www.klimzug.de/de/94.php, http://www.c... | de | Hamburg | 53.5511 | 9.9937 | opus | geoname_2_DE | other | [HD, S1, GF, GE, G1] | 2015-07-02 08:08:31 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [celestial, opendoar] | [3408, 5881] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_10001 |
4 | 10008 | 11 | archive | 404 | NaN | NaN | disk0/00/01/00/08 | 2015-08-08 14:52:26 | 2016-03-21 19:43:51 | 2015-08-08 14:52:26 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://creativematter.skidmore.edu/ | Creative Matter | Skidmore College Research | http://creativematter.skidmore.edu/do/oai/ | NaN | http://creativematter.skidmore.edu/recent.rss | NaN | Welcome to Creative Matter, a repository for t... | TRUE | FALSE | FALSE | Skidmore College | http://www.skidmore.edu/ | us | Saratoga Springs | 43.0961 | -73.7818 | bepress | geoname_2_US | other | NaN | 2015-07-06 17:35:50 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | celestial | 5882 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_10008 |
In [6]:
roar_df[roar_df.roar_eprintid == '10013']
Out[6]:
roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | 10013 | 31 | archive | 7104 | NaN | NaN | disk0/00/01/00/13 | 2015-08-08 14:53:04 | 2016-03-21 19:54:43 | 2015-08-08 14:53:04 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://er.ucu.edu.ua/ | ErUCU: Electronic repository of the Ukrainian ... | http://er.ucu.edu.ua/oai/request | http://er.ucu.edu.ua/sword/ | http://er.ucu.edu.ua/feed/rss_2.0/site | NaN | Ukrainian Catholic University’s institutional ... | TRUE | TRUE | TRUE | Ukrainian Catholic University | http://ucu.edu.ua/eng/ | ua | Lviv | NaN | NaN | dspace | geoname_2_UA | other | [D1, DK, BL, BR, L1, BS, D901, B1, AC, BF, HM,... | 2015-07-07 12:38:37 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [celestial, opendoar] | [5883, 3410] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [russell_group, ivy_league] | roar_10013 |
Loading dedup results¶
In [7]:
dup = pd.read_csv('../data/processed/ds_dedup_2022-02-16_13.03.17.csv',
sep=';', quotechar='"',
header=0, names=['dedup_id', 'duplicate_id', 'original_id', 'name', 'source'])
dup['unique_id'] = dup.source + '_' + dup.original_id
dup.head()
Out[7]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
0 | dedup::001e6d882e54c780ce269d3c46997287 | https://fairsharing.org/10.25504/FAIRsharing.q... | 2094 | RESID Database of Protein Modifications | FAIRsharing | FAIRsharing_2094 |
1 | dedup::001e6d882e54c780ce269d3c46997287 | re3data::r3d100011306 | r3d100011306 | RESID Database of Protein Modifications | re3data | re3data_r3d100011306 |
2 | dedup::003ab6b40af9b488decea7c582d150a2 | re3data::r3d100011894 | r3d100011894 | Synapse | re3data | re3data_r3d100011894 |
3 | dedup::003ab6b40af9b488decea7c582d150a2 | https://fairsharing.org/10.25504/FAIRsharing.d... | 2315 | Synapse | FAIRsharing | FAIRsharing_2315 |
4 | dedup::0048f2e3aa55ab88aaaac0cfa4153ad5 | opendoar::4562 | 4562 | erzincan binali yıldırım university institutio... | OpenDOAR | OpenDOAR_4562 |
In [8]:
dup.describe()
Out[8]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
count | 4712 | 4712 | 4712 | 4712 | 4712 | 4712 |
unique | 2239 | 4712 | 4238 | 4017 | 4 | 4712 |
top | dedup::67c12a6c3288a49f1db6a2343ec599ca | https://fairsharing.org/10.25504/FAIRsharing.q... | 3284 | UPN JATIM REPOSITORY | roar | FAIRsharing_2094 |
freq | 5 | 1 | 3 | 4 | 1981 | 1 |
Assessing duplicates distribution across registries¶
In [9]:
dup_grouped = dup.groupby('dedup_id').aggregate(list)
dup_grouped['source_set'] = dup_grouped.source.map(set)
In [10]:
dup_grouped[dup_grouped.source_set.str.len() == 4].count()
Out[10]:
duplicate_id 6 original_id 6 name 6 source 6 unique_id 6 source_set 6 dtype: int64
In [11]:
dup_grouped[dup_grouped.source_set.str.len() == 3].count()
Out[11]:
duplicate_id 61 original_id 61 name 61 source 61 unique_id 61 source_set 61 dtype: int64
In [12]:
dup_grouped[dup_grouped.source_set.str.len() == 2].count()
Out[12]:
duplicate_id 2029 original_id 2029 name 2029 source 2029 unique_id 2029 source_set 2029 dtype: int64
In [13]:
dup_grouped[dup_grouped.source_set.str.len() == 1].count()
Out[13]:
duplicate_id 143 original_id 143 name 143 source 143 unique_id 143 source_set 143 dtype: int64
Assessing duplicates within registries¶
In [14]:
roar_dup = dup[dup.source == 'roar'].groupby('dedup_id').count()
roar_dup = roar_dup[roar_dup.duplicate_id > 1]
roar_dup.aggregate(['count', 'sum'])
Out[14]:
duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|
count | 249 | 249 | 249 | 249 | 249 |
sum | 518 | 518 | 518 | 518 | 518 |
In [15]:
opendoar_dup = dup[dup.source == 'OpenDOAR'].groupby('dedup_id').count()
opendoar_dup = opendoar_dup[opendoar_dup.duplicate_id > 1]
opendoar_dup.aggregate(['count', 'sum'])
Out[15]:
duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|
count | 30 | 30 | 30 | 30 | 30 |
sum | 62 | 62 | 62 | 62 | 62 |
In [16]:
re3data_dup = dup[dup.source == 're3data'].groupby('dedup_id').count()
re3data_dup = re3data_dup[re3data_dup.duplicate_id > 1]
re3data_dup.aggregate(['count', 'sum'])
Out[16]:
duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|
count | 3 | 3 | 3 | 3 | 3 |
sum | 6 | 6 | 6 | 6 | 6 |
In [17]:
fairsharing_dup = dup[dup.source == 'FAIRsharing'].groupby('dedup_id').count()
fairsharing_dup = fairsharing_dup[fairsharing_dup.duplicate_id > 1]
fairsharing_dup.aggregate(['count', 'sum'])
Out[17]:
duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|
count | 0 | 0 | 0 | 0 | 0 |
sum | 0 | 0 | 0 | 0 | 0 |
There is one record which is counted twice. All ok: 2 records in roar and 2 in opendoar ['OpenDOAR_5226', 'roar_14929', 'OpenDOAR_3820', 'roar_16263']
In [18]:
np.intersect1d(roar_dup.index, opendoar_dup.index)
Out[18]:
array(['dedup::6973375bbb56846f0d935bd1cd9e0b98'], dtype=object)
In [19]:
dup[dup.dedup_id == 'dedup::6973375bbb56846f0d935bd1cd9e0b98']
Out[19]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
1937 | dedup::6973375bbb56846f0d935bd1cd9e0b98 | opendoar::3820 | 3820 | repositorio - universidad de la costa | OpenDOAR | OpenDOAR_3820 |
1938 | dedup::6973375bbb56846f0d935bd1cd9e0b98 | opendoar::5226 | 5226 | repositorio universidad de la costa | OpenDOAR | OpenDOAR_5226 |
1939 | dedup::6973375bbb56846f0d935bd1cd9e0b98 | roar::14929 | 14929 | Repositorio Universidad de la Costa | roar | roar_14929 |
1940 | dedup::6973375bbb56846f0d935bd1cd9e0b98 | roar::16263 | 16263 | Repositorio Universidad de la Costa | roar | roar_16263 |
Isolating single-registry duplicates¶
In [20]:
dup_within = dup.groupby('dedup_id').aggregate(list)
dup_within['source_set'] = dup_within.source.map(set)
dup_within = dup_within[dup_within.source_set.str.len() == 1]
dup_within.head()
Out[20]:
duplicate_id | original_id | name | source | unique_id | source_set | |
---|---|---|---|---|---|---|
dedup_id | ||||||
dedup::07b65089515c8f99812d14bbb01334a6 | [roar::474, roar::5541] | [474, 5541] | [ECNIS Repository (Environmental Cancer Risk, ... | [roar, roar] | [roar_474, roar_5541] | {roar} |
dedup::0be44aa69610e09805d4002baf7e0b10 | [roar::16867, roar::2907] | [16867, 2907] | [Chung Shan Medical University Institutional R... | [roar, roar] | [roar_16867, roar_2907] | {roar} |
dedup::0c34770edc42a1d2ac361b64cfabfb63 | [roar::5432, roar::4030] | [5432, 4030] | [Digital Library of Jelenia Góra, Digital Libr... | [roar, roar] | [roar_5432, roar_4030] | {roar} |
dedup::0c6ed4b110c461d9350bf5c620bc78d7 | [roar::3020, roar::3401, roar::5252] | [3020, 3401, 5252] | [KCE Repository, KCE Repository, KCE Repository] | [roar, roar, roar] | [roar_3020, roar_3401, roar_5252] | {roar} |
dedup::0e3c63baca694032044bbb00c2f1111e | [roar::8405, roar::8716] | [8405, 8716] | [Content Pro IRX, Content Pro IRX] | [roar, roar] | [roar_8405, roar_8716] | {roar} |
In [21]:
dup_within['source_set'] = dup_within.source_set.map(set.pop)
dup_within.head()
Out[21]:
duplicate_id | original_id | name | source | unique_id | source_set | |
---|---|---|---|---|---|---|
dedup_id | ||||||
dedup::07b65089515c8f99812d14bbb01334a6 | [roar::474, roar::5541] | [474, 5541] | [ECNIS Repository (Environmental Cancer Risk, ... | [roar, roar] | [roar_474, roar_5541] | roar |
dedup::0be44aa69610e09805d4002baf7e0b10 | [roar::16867, roar::2907] | [16867, 2907] | [Chung Shan Medical University Institutional R... | [roar, roar] | [roar_16867, roar_2907] | roar |
dedup::0c34770edc42a1d2ac361b64cfabfb63 | [roar::5432, roar::4030] | [5432, 4030] | [Digital Library of Jelenia Góra, Digital Libr... | [roar, roar] | [roar_5432, roar_4030] | roar |
dedup::0c6ed4b110c461d9350bf5c620bc78d7 | [roar::3020, roar::3401, roar::5252] | [3020, 3401, 5252] | [KCE Repository, KCE Repository, KCE Repository] | [roar, roar, roar] | [roar_3020, roar_3401, roar_5252] | roar |
dedup::0e3c63baca694032044bbb00c2f1111e | [roar::8405, roar::8716] | [8405, 8716] | [Content Pro IRX, Content Pro IRX] | [roar, roar] | [roar_8405, roar_8716] | roar |
In [22]:
dup_within.groupby('dedup_id').ngroups
Out[22]:
143
In [23]:
dup_within.groupby('source_set').count()
Out[23]:
duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|
source_set | |||||
OpenDOAR | 18 | 18 | 18 | 18 | 18 |
re3data | 2 | 2 | 2 | 2 | 2 |
roar | 123 | 123 | 123 | 123 | 123 |
In [24]:
dup_within = dup[dup.dedup_id.isin(dup_within.index)]
dup_within
Out[24]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
122 | dedup::07b65089515c8f99812d14bbb01334a6 | roar::474 | 474 | ECNIS Repository (Environmental Cancer Risk | roar | roar_474 |
123 | dedup::07b65089515c8f99812d14bbb01334a6 | roar::5541 | 5541 | ECNIS Repository (Environmental Cancer Risk | roar | roar_5541 |
184 | dedup::0be44aa69610e09805d4002baf7e0b10 | roar::16867 | 16867 | Chung Shan Medical University Institutional Re... | roar | roar_16867 |
185 | dedup::0be44aa69610e09805d4002baf7e0b10 | roar::2907 | 2907 | Chung Shan Medical University Institutional Re... | roar | roar_2907 |
192 | dedup::0c34770edc42a1d2ac361b64cfabfb63 | roar::5432 | 5432 | Digital Library of Jelenia Góra | roar | roar_5432 |
... | ... | ... | ... | ... | ... | ... |
4583 | dedup::f9293f212c2f13c7cc7a2d2a967ac7d5 | roar::13134 | 13134 | Repositorio Universidad de Sucre | roar | roar_13134 |
4608 | dedup::fab2415bf42ac76e4ae00aa68b61a4ba | roar::5482 | 5482 | Biblioteca Virtual del Centro de Documentación | roar | roar_5482 |
4609 | dedup::fab2415bf42ac76e4ae00aa68b61a4ba | roar::5214 | 5214 | Biblioteca Virtual del Centro de Documentación | roar | roar_5214 |
4690 | dedup::fee4180dcb5f2af4d963b6d74d82d8c2 | roar::3992 | 3992 | York St John University ArchivalWare Digital L... | roar | roar_3992 |
4691 | dedup::fee4180dcb5f2af4d963b6d74d82d8c2 | roar::5185 | 5185 | York St John University ArchivalWare Digital L... | roar | roar_5185 |
296 rows × 6 columns
Isolating hybrid duplicates¶
In [25]:
dup_across = dup[~dup.dedup_id.isin(dup_within.dedup_id)]
dup_across = dup_across.groupby('dedup_id').aggregate(list)
dup_across['source_set'] = dup_across.source.map(set)
dup_hybrid = dup_across[dup_across.source_set.str.len() < dup_across.source.str.len()]
dup_hybrid = dup[dup.dedup_id.isin(dup_hybrid.index)]
dup_hybrid
Out[25]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
53 | dedup::038ef33e8d3de51d3536d62e6c103be7 | roar::6167 | 6167 | Institutional Repository UIN Syarif Hidayatull... | roar | roar_6167 |
54 | dedup::038ef33e8d3de51d3536d62e6c103be7 | opendoar::2717 | 2717 | institutional repository uin syarif hidayatull... | OpenDOAR | OpenDOAR_2717 |
55 | dedup::038ef33e8d3de51d3536d62e6c103be7 | roar::6580 | 6580 | Institutional Repository UIN Syarif Hidayatull... | roar | roar_6580 |
72 | dedup::044edcd1c961b3942a7e0e90d1005e2d | roar::7902 | 7902 | The University of Arizona Campus Repository | roar | roar_7902 |
73 | dedup::044edcd1c961b3942a7e0e90d1005e2d | opendoar::2468 | 2468 | university of arizona campus repository | OpenDOAR | OpenDOAR_2468 |
... | ... | ... | ... | ... | ... | ... |
4596 | dedup::fa0721f07402e0593da77a46fa687da6 | opendoar::2545 | 2545 | sanok digital library | OpenDOAR | OpenDOAR_2545 |
4597 | dedup::fa0721f07402e0593da77a46fa687da6 | roar::5746 | 5746 | Sanok Digital Library | roar | roar_5746 |
4610 | dedup::fab888b1713fb886b13bbd2d569bba60 | opendoar::2539 | 2539 | publication server of the wuppertal institute | OpenDOAR | OpenDOAR_2539 |
4611 | dedup::fab888b1713fb886b13bbd2d569bba60 | roar::11212 | 11212 | Publication Server of the Wuppertal Institute | roar | roar_11212 |
4612 | dedup::fab888b1713fb886b13bbd2d569bba60 | roar::5891 | 5891 | Publication Server of the Wuppertal Institute | roar | roar_5891 |
434 rows × 6 columns
In [26]:
dup_hybrid.groupby('dedup_id').ngroups
Out[26]:
138
Isolating multiple-registry duplicates¶
In [27]:
dup_across = dup_across[dup_across.source_set.str.len() == dup_across.source.str.len()]
dup_across = dup[dup.dedup_id.isin(dup_across.index)]
dup_across
# dup[dup.dedup_id.isin(dup_across.index)]
Out[27]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
0 | dedup::001e6d882e54c780ce269d3c46997287 | https://fairsharing.org/10.25504/FAIRsharing.q... | 2094 | RESID Database of Protein Modifications | FAIRsharing | FAIRsharing_2094 |
1 | dedup::001e6d882e54c780ce269d3c46997287 | re3data::r3d100011306 | r3d100011306 | RESID Database of Protein Modifications | re3data | re3data_r3d100011306 |
2 | dedup::003ab6b40af9b488decea7c582d150a2 | re3data::r3d100011894 | r3d100011894 | Synapse | re3data | re3data_r3d100011894 |
3 | dedup::003ab6b40af9b488decea7c582d150a2 | https://fairsharing.org/10.25504/FAIRsharing.d... | 2315 | Synapse | FAIRsharing | FAIRsharing_2315 |
4 | dedup::0048f2e3aa55ab88aaaac0cfa4153ad5 | opendoar::4562 | 4562 | erzincan binali yıldırım university institutio... | OpenDOAR | OpenDOAR_4562 |
... | ... | ... | ... | ... | ... | ... |
4707 | dedup::ff7d2ea87cebddb182db2fb8cf32aa89 | opendoar::2126 | 2126 | sophia | OpenDOAR | OpenDOAR_2126 |
4708 | dedup::ffb342887a73ec0ead022e0414d765b1 | roar::668 | 668 | Infoscience: École polytechnique fédérale de L... | roar | roar_668 |
4709 | dedup::ffb342887a73ec0ead022e0414d765b1 | opendoar::185 | 185 | infoscience - école polytechnique fédérale de ... | OpenDOAR | OpenDOAR_185 |
4710 | dedup::ffbb6800107747f9224cdde0df95da7c | opendoar::3122 | 3122 | istanbul bilgi university library open access | OpenDOAR | OpenDOAR_3122 |
4711 | dedup::ffbb6800107747f9224cdde0df95da7c | roar::13646 | 13646 | Istanbul Bilgi University Library Open Access | roar | roar_13646 |
3982 rows × 6 columns
In [28]:
dup_across.groupby('dedup_id').ngroups
Out[28]:
1958
Double check partitions
In [29]:
dup.count()
Out[29]:
dedup_id 4712 duplicate_id 4712 original_id 4712 name 4712 source 4712 unique_id 4712 dtype: int64
In [30]:
dup_across.count() + dup_within.count() + dup_hybrid.count()
Out[30]:
dedup_id 4712 duplicate_id 4712 original_id 4712 name 4712 source 4712 unique_id 4712 dtype: int64
In [31]:
dup_within.groupby('dedup_id').ngroups + dup_across.groupby('dedup_id').ngroups + dup_hybrid.groupby('dedup_id').ngroups
Out[31]:
2239
In [32]:
dup.groupby('dedup_id').ngroups
Out[32]:
2239
Joining information¶
In [33]:
dup_within = dup_within.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_within = dup_within.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_within = dup_within.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_within = dup_within.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_within.head()
Out[33]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.metadata.cross-references | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.url-for-logo | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.data-access-condition.type | FAIRsharing_attributes.metadata.data-contact-information | FAIRsharing_attributes.metadata.data-deposition-condition.url | FAIRsharing_attributes.metadata.data-deposition-condition.type | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.data-access-condition.url | FAIRsharing_attributes.metadata.resource-sustainability.url | FAIRsharing_attributes.metadata.resource-sustainability.name | FAIRsharing_attributes.metadata.data-preservation-policy.url | FAIRsharing_attributes.metadata.data-preservation-policy.name | FAIRsharing_attributes.metadata.data-access-for-pre-publication-review | FAIRsharing_attributes.metadata.data-versioning | FAIRsharing_attributes.metadata.data-curation.type | FAIRsharing_attributes.metadata.data-curation.url | FAIRsharing_attributes.metadata.citation-to-related-publications | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_repository_metadata.repository_status | OpenDOAR_repository_metadata.fulltext_record_count | OpenDOAR_repository_metadata.metadata_record_count | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::07b65089515c8f99812d14bbb01334a6 | roar::474 | 474 | ECNIS Repository (Environmental Cancer Risk | roar | roar_474 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 474 | 281 | archive | 1 | NaN | NaN | disk0/00/00/04/74 | 2010-01-06 13:44:22 | 2011-07-06 08:19:53 | 2010-01-06 13:44:22 | other | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://ecnis.openrepository.com/ | ECNIS Repository (Environmental Cancer Risk, N... | NaN | NaN | NaN | NaN | This site is a subject specific repository con... | TRUE | TRUE | NaN | ECNIS (Environmental Cancer Risk, Nutrition an... | http://www.ecnis.org | pl | Lodz | 51.8 | 19.5 | openrepo | geoname_2_PL | other | NaN | 2008-06-03 08:05:43 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | opendoar | 1254 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_474 |
1 | dedup::07b65089515c8f99812d14bbb01334a6 | roar::5541 | 5541 | ECNIS Repository (Environmental Cancer Risk | roar | roar_5541 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5541 | 8 | archive | 8 | NaN | NaN | disk0/00/00/55/41 | 2012-12-12 01:21:03 | 2012-12-15 02:51:35 | 2012-12-12 01:21:03 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://ecnis.openrepository.com/ecnis/ | ECNIS Repository (Environmental Cancer Risk, N... | NaN | NaN | NaN | NaN | This site is a subject specific repository con... | NaN | NaN | NaN | ECNIS Network of Excellence | http://www.ecnis.org/ | pl | NaN | 51.8 | 19.5 | NaN | geoname_2_PL | other | NaN | 2012-07-01 15:13:36 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | opendoar | 1254 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_5541 |
2 | dedup::0be44aa69610e09805d4002baf7e0b10 | roar::16867 | 16867 | Chung Shan Medical University Institutional Re... | roar | roar_16867 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16867 | 3 | archive | 360 | NaN | NaN | disk0/00/01/68/67 | 2021-02-25 13:06:19 | 2021-02-25 13:06:19 | 2021-02-25 13:06:19 | institutional | 2907 | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | https://ir.csmu.edu.tw:8080 | Chung Shan Medical University Institutional Re... | https://ir.csmu.edu.tw:8080/ir-oai/request?ver... | NaN | NaN | NaN | NaN | TRUE | TRUE | FALSE | NaN | NaN | NaN | NaN | NaN | NaN | dspace | NaN | other | [RT, RC0254, RC1200, R1, RK] | 2009-10-21 00:00:00 | NaN | NaN | 0 | 0 | 0 | 100 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28... | NaN | NaN | NaN | NaN | NaN | NaN | celestial | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_16867 |
3 | dedup::0be44aa69610e09805d4002baf7e0b10 | roar::2907 | 2907 | Chung Shan Medical University Institutional Re... | roar | roar_2907 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2907 | 548 | archive | 360 | NaN | NaN | disk0/00/00/29/07 | 2010-07-29 01:40:55 | 2021-02-17 06:33:34 | 2010-07-29 01:40:55 | institutional | NaN | NaN | no_search | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | https://ir.csmu.edu.tw:8080 | Chung Shan Medical University Institutional Re... | https://ir.csmu.edu.tw:8080/ir-oai/request?ver... | NaN | NaN | NaN | NaN | TRUE | TRUE | FALSE | NaN | NaN | NaN | NaN | NaN | NaN | dspace | NaN | other | [RC0321, RT, RC0254, RC1200, R1, RK] | 2009-10-21 00:00:00 | NaN | NaN | 0 | 0 | 0 | 100 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28... | NaN | NaN | NaN | NaN | NaN | NaN | celestial | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_2907 |
4 | dedup::0c34770edc42a1d2ac361b64cfabfb63 | roar::5432 | 5432 | Digital Library of Jelenia Góra | roar | roar_5432 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5432 | 9 | archive | 8 | NaN | NaN | disk0/00/00/54/32 | 2012-11-19 20:28:01 | 2012-11-26 06:53:38 | 2012-11-19 20:28:01 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://jbc.jelenia-gora.pl/dlibra.html | Digital Library of Jelenia Góra | http://jbc.jelenia-gora.pl/dlibra/oai-pmh-repo... | NaN | NaN | NaN | Users may set up RSS feeds to be alerted to ne... | NaN | NaN | NaN | Jeleniogórskie Centrum Informacji i Edukacji R... | http://biblioteka.jelenia-gora.pl/ | pl | NaN | 50.9012 | 15.7341 | NaN | geoname_2_PL | other | NaN | 2012-07-01 15:12:22 | NaN | NaN | 0 | 0 | 0 | 20 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19... | NaN | NaN | NaN | NaN | [celestial, opendoar] | [4595, 2211] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_5432 |
In [34]:
dup_hybrid = dup_hybrid.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_hybrid.head()
Out[34]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.metadata.cross-references | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.url-for-logo | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.data-access-condition.type | FAIRsharing_attributes.metadata.data-contact-information | FAIRsharing_attributes.metadata.data-deposition-condition.url | FAIRsharing_attributes.metadata.data-deposition-condition.type | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.data-access-condition.url | FAIRsharing_attributes.metadata.resource-sustainability.url | FAIRsharing_attributes.metadata.resource-sustainability.name | FAIRsharing_attributes.metadata.data-preservation-policy.url | FAIRsharing_attributes.metadata.data-preservation-policy.name | FAIRsharing_attributes.metadata.data-access-for-pre-publication-review | FAIRsharing_attributes.metadata.data-versioning | FAIRsharing_attributes.metadata.data-curation.type | FAIRsharing_attributes.metadata.data-curation.url | FAIRsharing_attributes.metadata.citation-to-related-publications | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_repository_metadata.repository_status | OpenDOAR_repository_metadata.fulltext_record_count | OpenDOAR_repository_metadata.metadata_record_count | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::038ef33e8d3de51d3536d62e6c103be7 | roar::6167 | 6167 | Institutional Repository UIN Syarif Hidayatull... | roar | roar_6167 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6167 | 13 | archive | 2178 | NaN | NaN | disk0/00/00/61/67 | 2012-12-12 05:42:58 | 2013-07-14 15:12:12 | 2012-12-12 05:42:58 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://repository.uinjkt.ac.id | Institutional Repository UIN Syarif Hidayatull... | [http://repository.uinjkt.ac.id/oai, http://re... | http://repository.uinjkt.ac.id/sword/ | NaN | NaN | nstitutional Repository UIN Syarif Hidayatulla... | TRUE | TRUE | FALSE | [UIN Syarif Hidayatullah Jakarta, Pascasarjana... | [http://www.uinjkt.ac.id, http://graduate.uinj... | id | Jakarta | NaN | NaN | dspace | geoname_2_ID | other | AI | 2012-11-07 08:11:19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [celestial, opendoar] | [5108, 2717, 5109] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_6167 |
1 | dedup::038ef33e8d3de51d3536d62e6c103be7 | opendoar::2717 | 2717 | institutional repository uin syarif hidayatull... | OpenDOAR | OpenDOAR_2717 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2717 | {"name": "institutional repository uin syarif ... | [] | http://repository.uinjkt.ac.id/dspace/ | NaN | institutional | [] | 2022-01-12 15:35:36 | 2013-07-11 15:52:01 | [science, arts, humanities, social sciences, h... | [theses_and_dissertations] | [{'name': 'uin syarif hidayatullah jakarta, st... | [] | {"name": "dspace", "version": ""} | http://repository.uinjkt.ac.id/oai/ | yes | NaN | 0.0 | 36862.0 | OpenDOAR_2717 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | dedup::038ef33e8d3de51d3536d62e6c103be7 | roar::6580 | 6580 | Institutional Repository UIN Syarif Hidayatull... | roar | roar_6580 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6580 | 16 | archive | 2040 | NaN | NaN | disk0/00/00/65/80 | 2013-03-31 16:02:51 | 2013-04-06 01:42:14 | 2013-03-31 16:02:51 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://repository.uinjkt.ac.id | Institutional Repository UIN Syarif Hidayatull... | NaN | NaN | NaN | NaN | NaN | FALSE | FALSE | FALSE | NaN | NaN | id | Jakarta | 106.756 | -6.30591 | dspace | geoname_2_ID | other | [AC, Z665, Z004, Z719, BP, Q1] | 2013-03-04 07:20:37 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_6580 |
3 | dedup::044edcd1c961b3942a7e0e90d1005e2d | roar::7902 | 7902 | The University of Arizona Campus Repository | roar | roar_7902 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7902 | 12 | archive | 4910 | NaN | NaN | disk0/00/00/79/02 | 2014-03-05 11:50:29 | 2014-05-08 13:10:29 | 2014-03-05 11:50:29 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://arizona.openrepository.com/arizona/ | The University of Arizona Campus Repository | http://arizona.openrepository.com/arizona/oai/... | NaN | http://arizona.openrepository.com/arizona/feed... | NaN | The UA Campus Repository is an institutional r... | TRUE | TRUE | FALSE | The University of Arizona | http://www.arizona.edu/ | us | Tucson | NaN | NaN | dspace | geoname_2_US | other | NaN | 2014-02-25 20:17:47 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [celestial, opendoar] | [5404, http://opendoar.org/id/2468/] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_7902 |
4 | dedup::044edcd1c961b3942a7e0e90d1005e2d | opendoar::2468 | 2468 | university of arizona campus repository | OpenDOAR | OpenDOAR_2468 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2468 | {"name": "university of arizona campus reposit... | [] | http://arizona.openrepository.com/arizona/ | NaN | institutional | [] | 2022-01-12 15:35:32 | 2012-05-02 09:50:07 | [science, arts, humanities, health and medicin... | [journal_articles, theses_and_dissertations, u... | [{'name': 'university of arizona', 'alternativ... | [] | {"name": "other", "version": ""} | NaN | yes | NaN | NaN | 63231.0 | OpenDOAR_2468 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [35]:
dup_across = dup_across.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_across = dup_across.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_across = dup_across.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_across = dup_across.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_across.head()
Out[35]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.metadata.cross-references | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.url-for-logo | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.data-access-condition.type | FAIRsharing_attributes.metadata.data-contact-information | FAIRsharing_attributes.metadata.data-deposition-condition.url | FAIRsharing_attributes.metadata.data-deposition-condition.type | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.data-access-condition.url | FAIRsharing_attributes.metadata.resource-sustainability.url | FAIRsharing_attributes.metadata.resource-sustainability.name | FAIRsharing_attributes.metadata.data-preservation-policy.url | FAIRsharing_attributes.metadata.data-preservation-policy.name | FAIRsharing_attributes.metadata.data-access-for-pre-publication-review | FAIRsharing_attributes.metadata.data-versioning | FAIRsharing_attributes.metadata.data-curation.type | FAIRsharing_attributes.metadata.data-curation.url | FAIRsharing_attributes.metadata.citation-to-related-publications | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_repository_metadata.repository_status | OpenDOAR_repository_metadata.fulltext_record_count | OpenDOAR_repository_metadata.metadata_record_count | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::001e6d882e54c780ce269d3c46997287 | https://fairsharing.org/10.25504/FAIRsharing.q... | 2094 | RESID Database of Protein Modifications | FAIRsharing | FAIRsharing_2094 | 2094 | fairsharing-records | 2014-11-04T15:23:40.000Z | 2021-12-06T10:49:03.952Z | 10.25504/FAIRsharing.qaszjp | RESID Database of Protein Modifications | ready | [{'contact-name': 'John S Garavelli', 'contact... | http://pir.georgetown.edu/resid/ | 2094.0 | The RESID Database of Protein Modifications is... | RESID | [{'url': 'http://pir.georgetown.edu/resid/faq.... | NaN | [{'url': 'ftp://ftp.pir.georgetown.edu/pir_dat... | [{'url': 'https://www.re3data.org/repository/r... | [biodbcore-000563, bsg-d000563] | Database | knowledgebase | [Life Science] | [Molecular structure, Small molecule, Structur... | [All] | [] | [United Kingdom, European Union, Switzerland] | FAIRsharing record for: RESID Database of Prot... | RESID | https://fairsharing.org/10.25504/FAIRsharing.q... | 10.25504/FAIRsharing.qaszjp | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: The RESID D... | [{'id': 334, 'pubmed_id': 12520062, 'title': '... | [{'licence-name': 'Open Data Commons (ODC) Pub... | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_2094 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | dedup::001e6d882e54c780ce269d3c46997287 | re3data::r3d100011306 | r3d100011306 | RESID Database of Protein Modifications | re3data | re3data_r3d100011306 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | r3d100011306 | RESID Database of Protein Modifications | eng | [] | https://pir.georgetown.edu/resid/resid.shtml | [FAIRsharing_doi:10.25504/FAIRsharing.qaszjp, ... | ["pirmail@georgetown.edu"] | The RESID Database of Protein Modifications is... | eng | [disciplinary] | {"size": "", "updatedp": ""} | 2014 | NaN | ["eng"] | [{'name': '2 Life Sciences', 'scheme': 'DFG'},... | NaN | [{'name': 'Images', 'scheme': 'parse'}, {'name... | [dataProvider] | [genomes, life sciences, proteins, proteomes, ... | [{'institutionName': 'Georgetown University, M... | [{"policyName": "Terms of Use", "policyURL": "... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "Copyrights", "dataLicens... | closed | [] | ["unknown"] | yes | {"api": "ftp://ftp.pir.georgetown.edu/database... | ["none"] | NaN | [] | yes | unknown | [] | [] | {} | RESID is covered by Thomson Reuters Data Citat... | 2014-12-05 | 2019-01-17 | re3data_r3d100011306 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | dedup::003ab6b40af9b488decea7c582d150a2 | re3data::r3d100011894 | r3d100011894 | Synapse | re3data | re3data_r3d100011894 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | r3d100011894 | Synapse | eng | [] | https://www.synapse.org | [FAIRsharing_DOI:10.25504/FAIRsharing.dnxzmk, ... | ["synapseinfo@sagebase.org"] | Synapse is an open source software platform th... | eng | [other] | {"size": "", "updatedp": ""} | 2012-05-22 | NaN | ["eng"] | [{'name': '2 Life Sciences', 'scheme': 'DFG'},... | https://sagebionetworks.org/tools_resources/sy... | [{'name': 'Raw data', 'scheme': 'parse'}, {'na... | [dataProvider, serviceProvider] | [AMP-AD Knowledge Portal, DREAM Challenges, Gi... | [{'institutionName': 'Alfred P. Sloan Foundati... | [{"policyName": "Synapse Commons Governance Ov... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "closed", "dataAccessRestr... | [{"dataLicenseName": "other", "dataLicenseURL"... | restricted | [] | ["unknown"] | yes | {"api": "https://docs.synapse.org/rest/", "api... | ["DOI"] | NaN | [] | yes | yes | [] | [] | {} | NaN | 2015-12-03 | 2021-11-16 | re3data_r3d100011894 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | dedup::003ab6b40af9b488decea7c582d150a2 | https://fairsharing.org/10.25504/FAIRsharing.d... | 2315 | Synapse | FAIRsharing | FAIRsharing_2315 | 2315 | fairsharing-records | 2016-08-02T13:56:30.000Z | 2021-12-06T10:48:25.700Z | 10.25504/FAIRsharing.dnxzmk | Synapse | ready | [{'contact-name': 'Meredith Slota', 'contact-e... | https://www.synapse.org/ | 2315.0 | Synapse is a collaborative research platform t... | Synapse | [{'url': 'SynapseInfo@sagebase.org', 'name': '... | 2010.0 | [{'url': 'https://www.synapse.org/', 'name': '... | [{'url': 'https://www.re3data.org/repository/r... | [biodbcore-000791, bsg-d000791] | Database | repository | [Data Integration, Data Management, Biomedical... | [Experimental measurement, Protocol, Data stor... | [All] | [] | [United States] | FAIRsharing record for: Synapse | Synapse | https://fairsharing.org/10.25504/FAIRsharing.d... | 10.25504/FAIRsharing.dnxzmk | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: Synapse is ... | [{'id': 2450, 'pubmed_id': 24071850, 'title': ... | [{'licence-name': 'Creative Commons Attributio... | None | NaN | [{'url': 'https://sage-bionetworks.github.io/r... | NaN | NaN | NaN | NaN | NaN | NaN | [{'url': 'http://rest-docs.synapse.org/rest/',... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_2315 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | dedup::0048f2e3aa55ab88aaaac0cfa4153ad5 | opendoar::4562 | 4562 | erzincan binali yıldırım university institutio... | OpenDOAR | OpenDOAR_4562 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4562 | {"name": "erzincan binali y\u0131ld\u0131r\u01... | [] | http://earsiv.erzincan.edu.tr | NaN | institutional | [] | 2022-01-12 15:36:06 | 2019-04-24 09:06:10 | [social sciences] | [journal_articles] | [{'name': 'erzincan binali yıldırım university... | [] | {"name": "dspace", "version": ""} | http://earsiv.erzincan.edu.tr/oai | yes | NaN | NaN | NaN | OpenDOAR_4562 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [36]:
dup_within = dup_within.groupby('dedup_id').aggregate(list).reset_index()
dup_within['source_set'] = dup_within.source.map(set)
dup_within.head()
Out[36]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.metadata.cross-references | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.url-for-logo | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.data-access-condition.type | FAIRsharing_attributes.metadata.data-contact-information | FAIRsharing_attributes.metadata.data-deposition-condition.url | FAIRsharing_attributes.metadata.data-deposition-condition.type | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.data-access-condition.url | FAIRsharing_attributes.metadata.resource-sustainability.url | FAIRsharing_attributes.metadata.resource-sustainability.name | FAIRsharing_attributes.metadata.data-preservation-policy.url | FAIRsharing_attributes.metadata.data-preservation-policy.name | FAIRsharing_attributes.metadata.data-access-for-pre-publication-review | FAIRsharing_attributes.metadata.data-versioning | FAIRsharing_attributes.metadata.data-curation.type | FAIRsharing_attributes.metadata.data-curation.url | FAIRsharing_attributes.metadata.citation-to-related-publications | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_repository_metadata.repository_status | OpenDOAR_repository_metadata.fulltext_record_count | OpenDOAR_repository_metadata.metadata_record_count | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | source_set | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::07b65089515c8f99812d14bbb01334a6 | [roar::474, roar::5541] | [474, 5541] | [ECNIS Repository (Environmental Cancer Risk, ... | [roar, roar] | [roar_474, roar_5541] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [474, 5541] | [281, 8] | [archive, archive] | [1, 8] | [nan, nan] | [nan, nan] | [disk0/00/00/04/74, disk0/00/00/55/41] | [2010-01-06 13:44:22, 2012-12-12 01:21:03] | [2011-07-06 08:19:53, 2012-12-15 02:51:35] | [2010-01-06 13:44:22, 2012-12-12 01:21:03] | [other, institutional] | [nan, nan] | [nan, nan] | [show, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [0, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [http://ecnis.openrepository.com/, http://ecni... | [ECNIS Repository (Environmental Cancer Risk, ... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [This site is a subject specific repository co... | [TRUE, nan] | [TRUE, nan] | [nan, nan] | [ECNIS (Environmental Cancer Risk, Nutrition a... | [http://www.ecnis.org, http://www.ecnis.org/] | [pl, pl] | [Lodz, nan] | [51.8, 51.8] | [19.5, 19.5] | [openrepo, nan] | [geoname_2_PL, geoname_2_PL] | [other, other] | [nan, nan] | [2008-06-03 08:05:43, 2012-07-01 15:13:36] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [opendoar, opendoar] | [1254, 1254] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_474, roar_5541] | {roar} |
1 | dedup::0be44aa69610e09805d4002baf7e0b10 | [roar::16867, roar::2907] | [16867, 2907] | [Chung Shan Medical University Institutional R... | [roar, roar] | [roar_16867, roar_2907] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [16867, 2907] | [3, 548] | [archive, archive] | [360, 360] | [nan, nan] | [nan, nan] | [disk0/00/01/68/67, disk0/00/00/29/07] | [2021-02-25 13:06:19, 2010-07-29 01:40:55] | [2021-02-25 13:06:19, 2021-02-17 06:33:34] | [2021-02-25 13:06:19, 2010-07-29 01:40:55] | [institutional, institutional] | [2907, nan] | [nan, nan] | [show, no_search] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [0, 0] | [nan, nan] | [nan, nan] | [nan, nan] | [https://ir.csmu.edu.tw:8080, https://ir.csmu.... | [Chung Shan Medical University Institutional R... | [https://ir.csmu.edu.tw:8080/ir-oai/request?ve... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [TRUE, TRUE] | [TRUE, TRUE] | [FALSE, FALSE] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [dspace, dspace] | [nan, nan] | [other, other] | [[RT, RC0254, RC1200, R1, RK], [RC0321, RT, RC... | [2009-10-21 00:00:00, 2009-10-21 00:00:00] | [nan, nan] | [nan, nan] | [0, 0] | [0, 0] | [0, 0] | [100, 100] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [celestial, celestial] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_16867, roar_2907] | {roar} |
2 | dedup::0c34770edc42a1d2ac361b64cfabfb63 | [roar::5432, roar::4030] | [5432, 4030] | [Digital Library of Jelenia Góra, Digital Libr... | [roar, roar] | [roar_5432, roar_4030] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [5432, 4030] | [9, 12] | [archive, archive] | [8, 8] | [nan, nan] | [nan, nan] | [disk0/00/00/54/32, disk0/00/00/40/30] | [2012-11-19 20:28:01, 2011-08-02 23:17:15] | [2012-11-26 06:53:38, 2012-02-06 06:58:00] | [2012-11-19 20:28:01, 2011-08-02 23:17:15] | [institutional, institutional] | [nan, nan] | [nan, nan] | [show, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [http://jbc.jelenia-gora.pl/dlibra.html, http:... | [Digital Library of Jelenia Góra, Digital Libr... | [http://jbc.jelenia-gora.pl/dlibra/oai-pmh-rep... | [nan, nan] | [nan, nan] | [nan, nan] | [Users may set up RSS feeds to be alerted to n... | [nan, nan] | [nan, nan] | [nan, nan] | [Jeleniogórskie Centrum Informacji i Edukacji ... | [http://biblioteka.jelenia-gora.pl/, http://bi... | [pl, pl] | [nan, nan] | [50.9012, 50.9012] | [15.7341, 15.7341] | [nan, nan] | [geoname_2_PL, geoname_2_PL] | [other, other] | [nan, nan] | [2012-07-01 15:12:22, 2009-10-21 11:09:50] | [nan, nan] | [nan, nan] | [0, 0] | [0, 0] | [0, 0] | [20, 20] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | celestial, opendoar], [celestial, opendoar | 4595, 2211], [4595, 2211 | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_5432, roar_4030] | {roar} |
3 | dedup::0c6ed4b110c461d9350bf5c620bc78d7 | [roar::3020, roar::3401, roar::5252] | [3020, 3401, 5252] | [KCE Repository, KCE Repository, KCE Repository] | [roar, roar, roar] | [roar_3020, roar_3401, roar_5252] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [3020, 3401, 5252] | [260, 82, 10] | [archive, archive, archive] | [8, 8, 8] | [nan, nan, nan] | [nan, nan, nan] | [disk0/00/00/30/20, disk0/00/00/34/01, disk0/0... | [2010-09-13 09:52:23, 2010-12-20 21:30:30, 201... | [2016-04-17 21:53:51, 2016-04-17 21:51:59, 201... | [2010-09-13 09:52:22, 2010-12-20 21:30:30, 201... | [other, other, other] | [nan, nan, nan] | [nan, nan, nan] | [show, show, show] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [0, 0, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [http://kce.docressources.info/opac/index.php?... | [KCE Repository, KCE Repository, KCE Repository] | [http://kce.docressources.info/ws/PMBWs_2, htt... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [This site provides access to the publication ... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [Belgian Health Care Knowledge Centre, Belgian... | [http://www.kce.fgov.be/, http://www.kce.fgov.... | [be, be, be] | [nan, nan, nan] | [50.8463, 50.8463, 50.8463] | [4.3547, 4.3547, 4.3547] | [nan, nan, nan] | [nan, nan, geoname_2_BE] | [other, other, other] | [nan, nan, nan] | [2009-01-19 09:04:11, 2009-01-19 09:04:11, 201... | [nan, nan, nan] | [nan, nan, nan] | [0, 0, 0] | [0, 0, 0] | [0, 0, 0] | [250, 250, 250] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [[celestial, opendoar], [celestial, opendoar],... | 2246, 1879], [2246, 1879], [2246, 1879 | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... | [909, 909, 909] | [nan, nan, nan] | [roar_3020, roar_3401, roar_5252] | {roar} |
4 | dedup::0e3c63baca694032044bbb00c2f1111e | [roar::8405, roar::8716] | [8405, 8716] | [Content Pro IRX, Content Pro IRX] | [roar, roar] | [roar_8405, roar_8716] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [8405, 8716] | [17, 12] | [archive, archive] | [5386, 5386] | [nan, nan] | [nan, nan] | [disk0/00/00/84/05, disk0/00/00/87/16] | [2014-06-24 10:13:16, 2014-10-08 18:39:33] | [2014-06-28 01:36:04, 2014-10-11 01:36:34] | [2014-06-24 10:13:16, 2014-10-08 18:39:33] | [institutional, institutional] | [nan, nan] | [nan, nan] | [show, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [http://encore.tut.ac.za/iii/cpro, http://enco... | [Content Pro IRX, Content Pro IRX] | [nan, http://encore.tut.ac.za/iii/oairep/OAIRe... | [http://encore.tut.ac.za/iii/cpro/, nan] | [nan, nan] | [nan, nan] | [Tshwane University of Technology Digital Open... | [TRUE, TRUE] | [TRUE, TRUE] | [TRUE, FALSE] | [Tshwane University of Technology, Tshwane Uni... | [http://lib.tut.ac.za, http://tut.ac.za] | [za, za] | [Pretoria, Pretoria] | [-25, 25] | [28, 28] | [other, nan] | [geoname_2_ZA, geoname_2_ZA] | [other, other] | [nan, nan] | [2014-05-26 13:47:54, 2014-07-24 06:31:10] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [opendoar, celestial] | [3078, 5657] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_8405, roar_8716] | {roar} |
In [37]:
dup_hybrid = dup_hybrid.groupby('dedup_id').aggregate(list).reset_index()
dup_hybrid['source_set'] = dup_hybrid.source.map(set)
dup_hybrid.head()
Out[37]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.metadata.cross-references | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.url-for-logo | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.data-access-condition.type | FAIRsharing_attributes.metadata.data-contact-information | FAIRsharing_attributes.metadata.data-deposition-condition.url | FAIRsharing_attributes.metadata.data-deposition-condition.type | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.data-access-condition.url | FAIRsharing_attributes.metadata.resource-sustainability.url | FAIRsharing_attributes.metadata.resource-sustainability.name | FAIRsharing_attributes.metadata.data-preservation-policy.url | FAIRsharing_attributes.metadata.data-preservation-policy.name | FAIRsharing_attributes.metadata.data-access-for-pre-publication-review | FAIRsharing_attributes.metadata.data-versioning | FAIRsharing_attributes.metadata.data-curation.type | FAIRsharing_attributes.metadata.data-curation.url | FAIRsharing_attributes.metadata.citation-to-related-publications | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_repository_metadata.repository_status | OpenDOAR_repository_metadata.fulltext_record_count | OpenDOAR_repository_metadata.metadata_record_count | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | source_set | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::038ef33e8d3de51d3536d62e6c103be7 | [roar::6167, opendoar::2717, roar::6580] | [6167, 2717, 6580] | [Institutional Repository UIN Syarif Hidayatul... | [roar, OpenDOAR, roar] | [roar_6167, OpenDOAR_2717, roar_6580] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, 2717, nan] | [nan, {"name": "institutional repository uin s... | [nan, [], nan] | [nan, http://repository.uinjkt.ac.id/dspace/, ... | [nan, nan, nan] | [nan, institutional, nan] | [nan, [], nan] | [nan, 2022-01-12 15:35:36, nan] | [nan, 2013-07-11 15:52:01, nan] | [nan, [science, arts, humanities, social scien... | [nan, [theses_and_dissertations], nan] | [nan, [{'name': 'uin syarif hidayatullah jakar... | [nan, [], nan] | [nan, {"name": "dspace", "version": ""}, nan] | [nan, http://repository.uinjkt.ac.id/oai/, nan] | [nan, yes, nan] | [nan, nan, nan] | [nan, 0.0, nan] | [nan, 36862.0, nan] | [nan, OpenDOAR_2717, nan] | [6167, nan, 6580] | [13, nan, 16] | [archive, nan, archive] | [2178, nan, 2040] | [nan, nan, nan] | [nan, nan, nan] | [disk0/00/00/61/67, nan, disk0/00/00/65/80] | [2012-12-12 05:42:58, nan, 2013-03-31 16:02:51] | [2013-07-14 15:12:12, nan, 2013-04-06 01:42:14] | [2012-12-12 05:42:58, nan, 2013-03-31 16:02:51] | [institutional, nan, institutional] | [nan, nan, nan] | [nan, nan, nan] | [show, nan, show] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [http://repository.uinjkt.ac.id, nan, http://r... | [Institutional Repository UIN Syarif Hidayatul... | [[http://repository.uinjkt.ac.id/oai, http://r... | [http://repository.uinjkt.ac.id/sword/, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nstitutional Repository UIN Syarif Hidayatull... | [TRUE, nan, FALSE] | [TRUE, nan, FALSE] | [FALSE, nan, FALSE] | [[UIN Syarif Hidayatullah Jakarta, Pascasarjan... | [[http://www.uinjkt.ac.id, http://graduate.uin... | [id, nan, id] | [Jakarta, nan, Jakarta] | [nan, nan, 106.756] | [nan, nan, -6.30591] | [dspace, nan, dspace] | [geoname_2_ID, nan, geoname_2_ID] | [other, nan, other] | [AI, nan, [AC, Z665, Z004, Z719, BP, Q1]] | [2012-11-07 08:11:19, nan, 2013-03-04 07:20:37] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [[celestial, opendoar], nan, nan] | [[5108, 2717, 5109], nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [roar_6167, nan, roar_6580] | {OpenDOAR, roar} |
1 | dedup::044edcd1c961b3942a7e0e90d1005e2d | [roar::7902, opendoar::2468, roar::5216] | [7902, 2468, 5216] | [The University of Arizona Campus Repository, ... | [roar, OpenDOAR, roar] | [roar_7902, OpenDOAR_2468, roar_5216] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, 2468, nan] | [nan, {"name": "university of arizona campus r... | [nan, [], nan] | [nan, http://arizona.openrepository.com/arizon... | [nan, nan, nan] | [nan, institutional, nan] | [nan, [], nan] | [nan, 2022-01-12 15:35:32, nan] | [nan, 2012-05-02 09:50:07, nan] | [nan, [science, arts, humanities, health and m... | [nan, [journal_articles, theses_and_dissertati... | [nan, [{'name': 'university of arizona', 'alte... | [nan, [], nan] | [nan, {"name": "other", "version": ""}, nan] | [nan, nan, nan] | [nan, yes, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, 63231.0, nan] | [nan, OpenDOAR_2468, nan] | [7902, nan, 5216] | [12, nan, 8] | [archive, nan, archive] | [4910, nan, 8] | [nan, nan, nan] | [nan, nan, nan] | [disk0/00/00/79/02, nan, disk0/00/00/52/16] | [2014-03-05 11:50:29, nan, 2012-05-16 23:47:28] | [2014-05-08 13:10:29, nan, 2012-05-19 01:46:06] | [2014-03-05 11:50:29, nan, 2012-05-16 23:47:28] | [institutional, nan, institutional] | [nan, nan, nan] | [nan, nan, nan] | [show, nan, show] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [http://arizona.openrepository.com/arizona/, n... | [The University of Arizona Campus Repository, ... | [http://arizona.openrepository.com/arizona/oai... | [nan, nan, nan] | [http://arizona.openrepository.com/arizona/fee... | [nan, nan, nan] | [The UA Campus Repository is an institutional ... | [TRUE, nan, nan] | [TRUE, nan, nan] | [FALSE, nan, nan] | [The University of Arizona, nan, University of... | [http://www.arizona.edu/, nan, http://www.ariz... | [us, nan, us] | [Tucson, nan, nan] | [nan, nan, 32.2531] | [nan, nan, -110.948] | [dspace, nan, nan] | [geoname_2_US, nan, geoname_2_US] | [other, nan, other] | [nan, nan, nan] | [2014-02-25 20:17:47, nan, 2012-05-13 15:12:37] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [[celestial, opendoar], nan, opendoar] | [[5404, http://opendoar.org/id/2468/], nan, 2468] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [roar_7902, nan, roar_5216] | {OpenDOAR, roar} |
2 | dedup::0468c62a26a75be73109e1efa74bee44 | [roar::12182, opendoar::3096, roar::8677] | [12182, 3096, 8677] | [ScholarWorks @ UVM, scholarworks @ uvm, Schol... | [roar, OpenDOAR, roar] | [roar_12182, OpenDOAR_3096, roar_8677] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, 3096, nan] | [nan, {"name": "scholarworks @ uvm", "language... | [nan, [], nan] | [nan, http://scholarworks.uvm.edu/, nan] | [nan, nan, nan] | [nan, institutional, nan] | [nan, [], nan] | [nan, 2022-01-12 15:35:42, nan] | [nan, 2014-06-26 16:50:45, nan] | [nan, [science, technology, engineering, mathe... | [nan, [journal_articles, conference_and_worksh... | [nan, [{'name': 'university of vermont', 'alte... | [nan, [{"policy_url": "http://scholarworks.uvm... | [nan, {"name": "other", "version": ""}, nan] | [nan, http://scholarworks.uvm.edu/do/oai/, nan] | [nan, yes, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, 2871.0, nan] | [nan, OpenDOAR_3096, nan] | [12182, nan, 8677] | [11, nan, 11] | [archive, nan, archive] | [404, nan, 5634] | [nan, nan, nan] | [nan, nan, nan] | [disk0/00/01/21/82, nan, disk0/00/00/86/77] | [2017-03-11 17:50:59, nan, 2014-10-08 18:32:00] | [2017-03-18 02:36:55, nan, 2014-10-11 01:36:04] | [2017-03-11 17:50:59, nan, 2014-10-08 18:32:00] | [institutional, nan, institutional] | [nan, nan, nan] | [nan, nan, nan] | [show, nan, show] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [http://scholarworks.uvm.edu/, nan, http://sch... | [ScholarWorks @ UVM, nan, ScholarWorks @ UVM] | [http://scholarworks.uvm.edu/do/oai/, nan, htt... | [nan, nan, nan] | [http://scholarworks.uvm.edu/recent.rss, nan, ... | [nan, nan, nan] | [ScholarWorks @ UVM collects, preserves, and s... | [TRUE, nan, TRUE] | [TRUE, nan, TRUE] | [FALSE, nan, FALSE] | [University of Vermont, nan, University of Ver... | [https://www.uvm.edu/, nan, http://www.uvm.edu] | [us, nan, us] | [Burlington, VT, nan, Burington] | [44.4759, nan, 44.4856] | [-73.2121, nan, -73.2117] | [bepress, nan, bepress] | [geoname_2_US, nan, geoname_2_US] | [other, nan, other] | [nan, nan, nan] | [2017-01-13 20:44:06, nan, 2014-07-16 21:08:43] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [celestial, nan, [celestial, opendoar]] | [5654, nan, [http://opendoar.org/id/3096/, 5654]] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [roar_12182, nan, roar_8677] | {OpenDOAR, roar} |
3 | dedup::053eb8ab14c76525fd6f1daeb061f064 | [opendoar::9528, roar::15805, roar::15765] | [9528, 15805, 15765] | [repositorio institucional históricas - unam, ... | [OpenDOAR, roar, roar] | [OpenDOAR_9528, roar_15805, roar_15765] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [9528, nan, nan] | [{"name": "repositorio institucional hist\u00f... | [[], nan, nan] | [http://ru.historicas.unam.mx, nan, nan] | [nan, nan, nan] | [institutional, nan, nan] | [[], nan, nan] | [2022-01-12 15:36:31, nan, nan] | [2020-02-25 08:36:10, nan, nan] | [[humanities, technology], nan, nan] | [[journal_articles, other_special_item_types],... | [[{'name': 'unam', 'alternativeName': 'institu... | [[], nan, nan] | [{"name": "dspace", "version": ""}, nan, nan] | [http://ru.historicas.unam.mx/oai/request, nan... | [yes, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [OpenDOAR_9528, nan, nan] | [nan, 15805, 15765] | [nan, 5, 19] | [nan, archive, archive] | [nan, 12662, 12662] | [nan, nan, nan] | [nan, nan, nan] | [nan, disk0/00/01/58/05, disk0/00/01/57/65] | [nan, 2020-10-19 15:32:48, 2020-10-19 15:31:52] | [nan, 2021-01-25 22:20:40, 2021-01-26 20:47:24] | [nan, 2020-10-19 15:32:48, 2020-10-19 15:31:52] | [nan, institutional, institutional] | [nan, 15765, nan] | [nan, nan, nan] | [nan, show, no_search] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, http://ru.historicas.unam.mx/, https://r... | [nan, Repositorio Institucional Históricas-UNA... | [nan, http://ru.historicas.unam.mx/oai/request... | [nan, nan, nan] | [nan, http://ru.historicas.unam.mx/feed/rss_1.... | [nan, nan, nan] | [nan, El Repositorio Institucional Históricas-... | [nan, TRUE, TRUE] | [nan, TRUE, TRUE] | [nan, TRUE, TRUE] | [nan, Instituto de Investigaciones Históricas,... | [nan, http://www.historicas.unam.mx/, http://w... | [nan, mx, mx] | [nan, Ciudad de México, Ciudad de México] | [nan, nan, nan] | [nan, nan, nan] | [nan, dspace, dspace] | [nan, geoname_2_MX, geoname_2_MX] | [nan, other, other] | [nan, [D1, E11, F1201, D111, D901, DP, D204, D... | [nan, 2020-02-14 18:36:03, 2020-02-14 18:36:03] | [nan, ¿Quién puede depositar documentos en el ... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, opendoar] | [nan, nan, 9528] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, [russell_group, ivy_league], [russell_gr... | [nan, roar_15805, roar_15765] | {OpenDOAR, roar} |
4 | dedup::06a4be0dca480e71b823fd599ed221a0 | [opendoar::2557, roar::5840, roar::5915] | [2557, 5840, 5915] | [biblioteka cyfrowa diecezji legnickiej, Bibli... | [OpenDOAR, roar, roar] | [OpenDOAR_2557, roar_5840, roar_5915] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [2557, nan, nan] | [{"name": "biblioteka cyfrowa diecezji legnick... | [[], nan, nan] | [http://bcdl.pl/dlibra, nan, nan] | [nan, nan, nan] | [institutional, nan, nan] | [[], nan, nan] | [2022-01-12 15:35:34, nan, nan] | [2012-08-20 11:35:42, nan, nan] | [[humanities], nan, nan] | [[journal_articles, books_chapters_and_section... | [[{'name': 'biblioteka wyższego seminarium duc... | [[], nan, nan] | [{"name": "dlibra", "version": ""}, nan, nan] | [http://bcdl.pl/dlibra/oai-pmh-repository.xml,... | [yes, nan, nan] | [nan, nan, nan] | [0.0, nan, nan] | [368.0, nan, nan] | [OpenDOAR_2557, nan, nan] | [nan, 5840, 5915] | [nan, 9, 9] | [nan, archive, archive] | [nan, 8, 8] | [nan, nan, nan] | [nan, nan, nan] | [nan, disk0/00/00/58/40, disk0/00/00/59/15] | [nan, 2012-12-12 04:59:36, 2012-12-12 05:12:30] | [nan, 2012-12-17 06:53:45, 2012-12-17 06:53:50] | [nan, 2012-12-12 04:59:36, 2012-12-12 05:12:30] | [nan, institutional, institutional] | [nan, nan, nan] | [nan, nan, nan] | [nan, show, show] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, http://bcdl.pl/, http://bcdl.pl/dlibra] | [nan, Biblioteka Cyfrowa Diecezji Legnickiej, ... | [nan, http://bcdl.pl/dlibra/oai-pmh-repository... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, This site provides access to the digitis... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, Biblioteka Wyższego Seminarium Duchowneg... | [nan, http://www.biblioteka.diecezja.legnica.p... | [nan, pl, pl] | [nan, nan, nan] | [nan, 51.207, 51.207] | [nan, 16.1553, 16.1553] | [nan, nan, nan] | [nan, geoname_2_PL, geoname_2_PL] | [nan, other, other] | [nan, nan, nan] | [nan, 2012-08-26 15:12:13, 2012-09-16 15:12:16] | [nan, nan, nan] | [nan, nan, nan] | [nan, 0, 0] | [nan, 0, 0] | [nan, 0, 0] | [nan, 20, 20] | [nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, [celestial, opendoar], [celestial, opend... | [nan, [2557, 5081], [2557, 5081]] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, roar_5840, roar_5915] | {OpenDOAR, roar} |
In [38]:
dup_across = dup_across.groupby('dedup_id').aggregate(list).reset_index()
dup_across['source_set'] = dup_across.source.map(set)
dup_across.head()
Out[38]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.metadata.cross-references | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.url-for-logo | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.data-access-condition.type | FAIRsharing_attributes.metadata.data-contact-information | FAIRsharing_attributes.metadata.data-deposition-condition.url | FAIRsharing_attributes.metadata.data-deposition-condition.type | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.data-access-condition.url | FAIRsharing_attributes.metadata.resource-sustainability.url | FAIRsharing_attributes.metadata.resource-sustainability.name | FAIRsharing_attributes.metadata.data-preservation-policy.url | FAIRsharing_attributes.metadata.data-preservation-policy.name | FAIRsharing_attributes.metadata.data-access-for-pre-publication-review | FAIRsharing_attributes.metadata.data-versioning | FAIRsharing_attributes.metadata.data-curation.type | FAIRsharing_attributes.metadata.data-curation.url | FAIRsharing_attributes.metadata.citation-to-related-publications | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_repository_metadata.repository_status | OpenDOAR_repository_metadata.fulltext_record_count | OpenDOAR_repository_metadata.metadata_record_count | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | source_set | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::001e6d882e54c780ce269d3c46997287 | [https://fairsharing.org/10.25504/FAIRsharing.... | [2094, r3d100011306] | [RESID Database of Protein Modifications, RESI... | [FAIRsharing, re3data] | [FAIRsharing_2094, re3data_r3d100011306] | [2094, nan] | [fairsharing-records, nan] | [2014-11-04T15:23:40.000Z, nan] | [2021-12-06T10:49:03.952Z, nan] | [10.25504/FAIRsharing.qaszjp, nan] | [RESID Database of Protein Modifications, nan] | [ready, nan] | [[{'contact-name': 'John S Garavelli', 'contac... | [http://pir.georgetown.edu/resid/, nan] | [2094.0, nan] | [The RESID Database of Protein Modifications i... | [RESID, nan] | [[{'url': 'http://pir.georgetown.edu/resid/faq... | [nan, nan] | [[{'url': 'ftp://ftp.pir.georgetown.edu/pir_da... | [[{'url': 'https://www.re3data.org/repository/... | [[biodbcore-000563, bsg-d000563], nan] | [Database, nan] | [knowledgebase, nan] | [[Life Science], nan] | [[Molecular structure, Small molecule, Structu... | [[All], nan] | [[], nan] | [[United Kingdom, European Union, Switzerland]... | [FAIRsharing record for: RESID Database of Pro... | [RESID, nan] | [https://fairsharing.org/10.25504/FAIRsharing.... | [10.25504/FAIRsharing.qaszjp, nan] | [https://creativecommons.org/licenses/by-sa/4.... | [This FAIRsharing record describes: The RESID ... | [[{'id': 334, 'pubmed_id': 12520062, 'title': ... | [[{'licence-name': 'Open Data Commons (ODC) Pu... | [None, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [FAIRsharing_2094, nan] | [nan, r3d100011306] | [nan, RESID Database of Protein Modifications] | [nan, eng] | [nan, []] | [nan, https://pir.georgetown.edu/resid/resid.s... | [nan, [FAIRsharing_doi:10.25504/FAIRsharing.qa... | [nan, ["pirmail@georgetown.edu"]] | [nan, The RESID Database of Protein Modificati... | [nan, eng] | [nan, [disciplinary]] | [nan, {"size": "", "updatedp": ""}] | [nan, 2014] | [nan, nan] | [nan, ["eng"]] | [nan, [{'name': '2 Life Sciences', 'scheme': '... | [nan, nan] | [nan, [{'name': 'Images', 'scheme': 'parse'}, ... | [nan, [dataProvider]] | [nan, [genomes, life sciences, proteins, prote... | [nan, [{'institutionName': 'Georgetown Univers... | [nan, [{"policyName": "Terms of Use", "policyU... | [nan, {"databaseAccessType": "open", "databas... | [nan, []] | [nan, [{"dataAccessType": "open", "dataAccessR... | [nan, [{"dataLicenseName": "Copyrights", "data... | [nan, closed] | [nan, []] | [nan, ["unknown"]] | [nan, yes] | [nan, {"api": "ftp://ftp.pir.georgetown.edu/da... | [nan, ["none"]] | [nan, nan] | [nan, []] | [nan, yes] | [nan, unknown] | [nan, []] | [nan, []] | [nan, {}] | [nan, RESID is covered by Thomson Reuters Data... | [nan, 2014-12-05] | [nan, 2019-01-17] | [nan, re3data_r3d100011306] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | {FAIRsharing, re3data} |
1 | dedup::003ab6b40af9b488decea7c582d150a2 | [re3data::r3d100011894, https://fairsharing.or... | [r3d100011894, 2315] | [Synapse, Synapse] | [re3data, FAIRsharing] | [re3data_r3d100011894, FAIRsharing_2315] | [nan, 2315] | [nan, fairsharing-records] | [nan, 2016-08-02T13:56:30.000Z] | [nan, 2021-12-06T10:48:25.700Z] | [nan, 10.25504/FAIRsharing.dnxzmk] | [nan, Synapse] | [nan, ready] | [nan, [{'contact-name': 'Meredith Slota', 'con... | [nan, https://www.synapse.org/] | [nan, 2315.0] | [nan, Synapse is a collaborative research plat... | [nan, Synapse] | [nan, [{'url': 'SynapseInfo@sagebase.org', 'na... | [nan, 2010.0] | [nan, [{'url': 'https://www.synapse.org/', 'na... | [nan, [{'url': 'https://www.re3data.org/reposi... | [nan, [biodbcore-000791, bsg-d000791]] | [nan, Database] | [nan, repository] | [nan, [Data Integration, Data Management, Biom... | [nan, [Experimental measurement, Protocol, Dat... | [nan, [All]] | [nan, []] | [nan, [United States]] | [nan, FAIRsharing record for: Synapse] | [nan, Synapse] | [nan, https://fairsharing.org/10.25504/FAIRsha... | [nan, 10.25504/FAIRsharing.dnxzmk] | [nan, https://creativecommons.org/licenses/by-... | [nan, This FAIRsharing record describes: Synap... | [nan, [{'id': 2450, 'pubmed_id': 24071850, 'ti... | [nan, [{'licence-name': 'Creative Commons Attr... | [nan, None] | [nan, nan] | [nan, [{'url': 'https://sage-bionetworks.githu... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, [{'url': 'http://rest-docs.synapse.org/r... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, FAIRsharing_2315] | [r3d100011894, nan] | [Synapse, nan] | [eng, nan] | [[], nan] | [https://www.synapse.org, nan] | [[FAIRsharing_DOI:10.25504/FAIRsharing.dnxzmk,... | [["synapseinfo@sagebase.org"], nan] | [Synapse is an open source software platform t... | [eng, nan] | [[other], nan] | [{"size": "", "updatedp": ""}, nan] | [2012-05-22, nan] | [nan, nan] | [["eng"], nan] | [[{'name': '2 Life Sciences', 'scheme': 'DFG'}... | [https://sagebionetworks.org/tools_resources/s... | [[{'name': 'Raw data', 'scheme': 'parse'}, {'n... | [[dataProvider, serviceProvider], nan] | [[AMP-AD Knowledge Portal, DREAM Challenges, G... | [[{'institutionName': 'Alfred P. Sloan Foundat... | [[{"policyName": "Synapse Commons Governance O... | [ {"databaseAccessType": "open", "databaseAcce... | [[], nan] | [[{"dataAccessType": "closed", "dataAccessRest... | [[{"dataLicenseName": "other", "dataLicenseURL... | [restricted, nan] | [[], nan] | [["unknown"], nan] | [yes, nan] | [{"api": "https://docs.synapse.org/rest/", "ap... | [["DOI"], nan] | [nan, nan] | [[], nan] | [yes, nan] | [yes, nan] | [[], nan] | [[], nan] | [{}, nan] | [nan, nan] | [2015-12-03, nan] | [2021-11-16, nan] | [re3data_r3d100011894, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | {FAIRsharing, re3data} |
2 | dedup::0048f2e3aa55ab88aaaac0cfa4153ad5 | [opendoar::4562, roar::14673] | [4562, 14673] | [erzincan binali yıldırım university instituti... | [OpenDOAR, roar] | [OpenDOAR_4562, roar_14673] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [4562, nan] | [{"name": "erzincan binali y\u0131ld\u0131r\u0... | [[], nan] | [http://earsiv.erzincan.edu.tr, nan] | [nan, nan] | [institutional, nan] | [[], nan] | [2022-01-12 15:36:06, nan] | [2019-04-24 09:06:10, nan] | [[social sciences], nan] | [[journal_articles], nan] | [[{'name': 'erzincan binali yıldırım universit... | [[], nan] | [{"name": "dspace", "version": ""}, nan] | [http://earsiv.erzincan.edu.tr/oai, nan] | [yes, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [OpenDOAR_4562, nan] | [nan, 14673] | [nan, 9] | [nan, archive] | [nan, 11738] | [nan, nan] | [nan, nan] | [nan, disk0/00/01/46/73] | [nan, 2019-07-19 14:26:33] | [nan, 2019-07-22 08:07:03] | [nan, 2019-07-19 14:26:33] | [nan, institutional] | [nan, nan] | [nan, nan] | [nan, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, http://earsiv.erzincan.edu.tr] | [nan, Erzincan Binali Yıldırım University Inst... | [nan, http://earsiv.erzincan.edu.tr/oai] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, DSpace@Erzincan is a growing collection ... | [nan, TRUE] | [nan, TRUE] | [nan, TRUE] | [nan, Erzincan Binali Yıldırım University] | [nan, http://www.ebyu.edu.tr] | [nan, tr] | [nan, Erzincan] | [nan, 39.7463] | [nan, 39.5149] | [nan, dspace] | [nan, geoname_2_TR] | [nan, other] | [nan, nan] | [nan, 2019-04-18 19:09:41] | [nan, DSpace@Erzincan is a growing collection ... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, roar_14673] | {OpenDOAR, roar} |
3 | dedup::00a35b4a2495a342f5632d18cf5985f6 | [opendoar::6787, roar::13960] | [6787, 13960] | [scholarly commons university of the pacific, ... | [OpenDOAR, roar] | [OpenDOAR_6787, roar_13960] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [6787, nan] | [{"name": "scholarly commons university of the... | [[], nan] | [https://scholarlycommons.pacific.edu, nan] | [nan, nan] | [institutional, nan] | [[], nan] | [2022-01-12 15:36:16, nan] | [2019-09-28 02:20:20, nan] | [[science, technology, engineering, mathematic... | [[journal_articles, theses_and_dissertations, ... | [[{'name': 'university of the pacific', 'alter... | [[], nan] | [{"name": "digital_commons", "version": ""}, nan] | [https://scholarlycommons.pacific.edu/do/oai, ... | [yes, nan] | [nan, nan] | [6534.0, nan] | [59858.0, nan] | [OpenDOAR_6787, nan] | [nan, 13960] | [nan, 10] | [nan, archive] | [nan, 11103] | [nan, nan] | [nan, nan] | [nan, disk0/00/01/39/60] | [nan, 2018-10-07 12:48:04] | [nan, 2018-10-13 01:35:50] | [nan, 2018-10-07 12:48:04] | [nan, institutional] | [nan, nan] | [nan, nan] | [nan, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, https://scholarlycommons.pacific.edu/] | [nan, Scholarly Commons - University of the Pa... | [nan, https://scholarlycommons.pacific.edu/do/... | [nan, nan] | [nan, https://scholarlycommons.pacific.edu/rec... | [nan, nan] | [nan, Scholarly Commons is a service of the Un... | [nan, TRUE] | [nan, TRUE] | [nan, FALSE] | [nan, University of the Pacific] | [nan, https://www.pacific.edu/] | [nan, us] | [nan, [Sacramento, Stockton, San Francisco]] | [nan, nan] | [nan, nan] | [nan, bepress] | [nan, geoname_2_US] | [nan, other] | [nan, nan] | [nan, 2018-09-05 23:09:53] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, celestial] | [nan, 6700] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, roar_13960] | {OpenDOAR, roar} |
4 | dedup::00a6af15fba302b272b110ac88924779 | [roar::755, opendoar::1285] | [755, 1285] | [KFUPM ePrints, kfupm eprints] | [roar, OpenDOAR] | [roar_755, OpenDOAR_1285] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, 1285] | [nan, {"name": "kfupm eprints", "language": "e... | [nan, []] | [nan, http://eprints.kfupm.edu.sa/] | [nan, nan] | [nan, institutional] | [nan, []] | [nan, 2022-01-12 15:35:13] | [nan, 2008-07-18 11:11:28] | [nan, [science, humanities, mathematics, techn... | [nan, [journal_articles, conference_and_worksh... | [nan, [{'name': 'king fahd university of petro... | [nan, []] | [nan, {"name": "eprints", "version": ""}] | [nan, http://eprints.kfupm.edu.sa/cgi/oai2] | [nan, yes] | [nan, nan] | [nan, 4890.0] | [nan, 6221.0] | [nan, OpenDOAR_1285] | [755, nan] | [511, nan] | [archive, nan] | [1, nan] | [nan, nan] | [nan, nan] | [disk0/00/00/07/55, nan] | [2010-01-06 13:44:43, nan] | [2011-07-18 05:50:28, nan] | [2010-01-06 13:44:43, nan] | [institutional, nan] | [nan, nan] | [nan, nan] | [show, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [0, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [http://eprints.kfupm.edu.sa/, nan] | [KFUPM ePrints, nan] | [http://eprints.kfupm.edu.sa/perl/oai2, nan] | [nan, nan] | [http://eprints.kfupm.edu.sa/cgi/latest_tool?o... | [nan, nan] | [nan, nan] | [TRUE, nan] | [TRUE, nan] | [nan, nan] | [King Fahd University of Petroleum and Mineral... | [http://www.kfupm.edu.sa, nan] | [sa, nan] | [ DHAHRAN, nan] | [17.4333, nan] | [43.2167, nan] | [eprints, nan] | [geoname_2_SA, nan] | [3.2.7, nan] | [nan, nan] | [2007-09-10 11:33:13, nan] | [nan, nan] | [nan, nan] | [0, nan] | [0, nan] | [0, nan] | [100, nan] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [[celestial, opendoar], nan] | [[1234, 1285], nan] | [nan, nan] | [nan, nan] | [nan, nan] | [37, nan] | [26, nan] | [69, nan] | [23, nan] | [203, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_755, nan] | {OpenDOAR, roar} |
In [39]:
def remove_nan(list_obj):
if isinstance(list_obj, list):
while np.nan in list_obj:
list_obj.remove(np.nan)
return list_obj
dup_within.applymap(remove_nan).to_csv('../data/processed/dup_within.csv')
dup_hybrid.applymap(remove_nan).to_csv('../data/processed/dup_hybrid.csv')
dup_across.applymap(remove_nan).to_csv('../data/processed/dup_across.csv')