696 KiB
696 KiB
In [1]:
import ast
import csv
import json
import glom
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
pd.set_option('display.max_columns', None)
Loading data from registries¶
In [2]:
with open('../data/raw/fairsharing_dump_api_09_2021.json') as f:
lines = f.read().splitlines()
fairsharing_df = pd.DataFrame(lines)
fairsharing_df.columns = ['json_element']
fairsharing_df['json_element'].apply(json.loads)
fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))
fairsharing_df['unique_id'] = 'FAIRsharing_' + fairsharing_df.id
fairsharing_df = fairsharing_df.add_prefix('FAIRsharing_')
fairsharing_df.head()
Out[2]:
FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1723 | fairsharing-records | 2014-11-04T15:23:40.000Z | 2021-09-30T11:39:06.829Z | 10.25504/FAIRsharing.8t18te | Cell Image Library | ready | [{'contact-name': 'David Orloff', 'contact-ema... | http://www.cellimagelibrary.org | 1723 | This library is a public and easily accessible... | [{'url': 'http://www.cellimagelibrary.org/page... | 2010.0 | [{'name': 'live update', 'type': 'data release... | [biodbcore-000180, bsg-d000180] | Database | repository | [Cell Biology, Life Science] | [Cell, Microscopy, Light microscopy, Electron ... | [All] | [] | [United States] | FAIRsharing record for: Cell Image Library | None | https://fairsharing.org/10.25504/FAIRsharing.8... | 10.25504/FAIRsharing.8t18te | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: This librar... | [{'id': 232, 'pubmed_id': 23203874, 'title': '... | [{'licence-name': 'Cell Image Library Data Pol... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_1723 |
1 | 3101 | fairsharing-records | 2020-09-16T08:49:13.000Z | 2021-09-30T11:36:45.452Z | NaN | WHOI Ship Data-Grabber System | ready | NaN | http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html | 3101 | The WHOI Ship DataGrabber system provides the ... | [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o... | 2004.0 | [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai... | [biodbcore-001609, bsg-d001609] | Database | repository | [Earth Science, Water Research, Oceanography] | [] | [Not applicable] | [subseafloor environments] | [United States] | FAIRsharing record for: WHOI Ship Data-Grabber... | None | https://fairsharing.org/fairsharing_records/3101 | None | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: The WHOI Sh... | [] | [{'licence-name': 'NDSF Data Archive Policy', ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_3101 |
2 | 2649 | fairsharing-records | 2018-08-07T20:23:32.000Z | 2021-09-30T11:39:07.898Z | NaN | Electron Microscope Public Image Archive | ready | [{'contact-name': 'General contact', 'contact-... | https://www.ebi.ac.uk/pdbe/emdb/empiar/ | 2649 | EMPIAR, the Electron Microscopy Public Image A... | [{'url': 'https://www.ebi.ac.uk/support/EMPIAR... | 2015.0 | [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... | [biodbcore-001140, bsg-d001140] | Database | repository | [Bioinformatics, Biology] | [Protein image, Microscopy, Electron microscop... | [All] | [] | [Greece, Czech Republic, United Kingdom, Icela... | FAIRsharing record for: Electron Microscope Pu... | EMPIAR | https://fairsharing.org/fairsharing_records/2649 | None | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: EMPIAR, the... | [{'id': 2232, 'pubmed_id': 27067018, 'title': ... | [{'licence-name': 'EMBL-EBI Terms of Use', 'li... | [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27... | EMPIAR | [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... | [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... | NaN | NaN | NaN | FAIRsharing_2649 |
3 | 2657 | fairsharing-records | 2018-08-13T15:12:11.000Z | 2021-09-30T11:37:28.736Z | 10.25504/FAIRsharing.tnByoG | ClinicalStudyDataRequest.com | ready | [{'contact-email': 'support@clinicalstudydatar... | https://clinicalstudydatarequest.com/ | 2657 | ClinicalStudyDataRequest.com (CSDR) is a conso... | [{'url': 'https://clinicalstudydatarequest.com... | 2014.0 | [{'url': 'https://clinicalstudydatarequest.com... | [biodbcore-001149, bsg-d001149] | Database | repository | [Preclinical Studies, Biomedical Science] | [] | [Homo sapiens] | [] | [Worldwide] | FAIRsharing record for: ClinicalStudyDataReque... | CSDR | https://fairsharing.org/10.25504/FAIRsharing.t... | 10.25504/FAIRsharing.tnByoG | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: ClinicalStu... | [] | [{'licence-name': 'CSDR Data Sharing Agreement... | NaN | CSDR | NaN | NaN | NaN | NaN | NaN | FAIRsharing_2657 |
4 | 2078 | fairsharing-records | 2014-11-04T15:23:40.000Z | 2021-09-30T11:34:43.129Z | 10.25504/FAIRsharing.3axym7 | Germplasm Resources Information Network | ready | [{'contact-email': 'dbmu@ars-grin.gov'}] | https://www.ars-grin.gov/ | 2078 | GRIN provides National Genetic Resources Progr... | [{'url': 'https://www.ars-grin.gov/Pages/Colle... | 2010.0 | [{'url': 'https://www.ars-grin.gov/', 'name': ... | [biodbcore-000546, bsg-d000546] | Database | repository | [Life Science] | [Cell, Cell culture, Germplasm] | [Bacteria, Metazoa, Viridiplantae] | [] | [United States] | FAIRsharing record for: Germplasm Resources In... | GRIN | https://fairsharing.org/10.25504/FAIRsharing.3... | 10.25504/FAIRsharing.3axym7 | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: GRIN provid... | [] | [] | NaN | GRIN | NaN | NaN | NaN | NaN | NaN | FAIRsharing_2078 |
In [3]:
re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t',
converters={'subject': ast.literal_eval,
'keyword': ast.literal_eval,
'additionalName': ast.literal_eval,
'repositoryIdentifier': ast.literal_eval,
'type': ast.literal_eval,
'contentType': ast.literal_eval,
'providerType': ast.literal_eval,
'institution': ast.literal_eval
})
re3data_df['unique_id'] = 're3data_' + re3data_df.orgIdentifier
re3data_df = re3data_df.add_prefix('re3data_')
re3data_df.head()
Out[3]:
re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | r3d100000001 | Odum Institute Archive Dataverse | eng | [] | https://dataverse.unc.edu/dataverse/odum | [] | ["https://dataverse.unc.edu/dataverse/odum#", ... | The Odum Institute Archive Dataverse contains ... | eng | [disciplinary] | {"size": "13 dataverses; 3.050 datasets", "upd... | NaN | NaN | ["eng"] | [1 Humanities and Social Sciences, 111 Social ... | NaN | [Databases, Plain text, Scientific and statist... | [dataProvider] | [FAIR, Middle East, crime, demography, economy... | [{'institutionName': 'Odum Institute for Resea... | [{"policyName": "Collection Development Policy... | {"databaseAccessType": "open", "databaseAcces... | [{"databaseLicenseName": "CC0", "databaseLicen... | [{"dataAccessType": "embargoed", "dataAccessRe... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [] | ["DataVerse"] | NaN | [] | ["DOI"] | NaN | [] | unknown | yes | ["other"] | [{"metadataStandardName": "DDI - Data Document... | {} | Odum Dataverse is covered by Thomson Reuters D... | 2013-06-10 | 2021-07-06 | re3data_r3d100000001 |
1 | r3d100000002 | Access to Archival Databases | eng | [{'additionalName': 'AAD', 'additionalNameLang... | https://aad.archives.gov/aad/ | [RRID:SCR_010479, RRID:nlx_157752] | ["https://www.archives.gov/contact"] | You will find in the Access to Archival Databa... | eng | [disciplinary] | {"size": "", "updatedp": ""} | 1985 | NaN | ["eng", "spa"] | [1 Humanities and Social Sciences, 102 History... | https://www.archives.gov/publications/general-... | [Images, Standard office documents, Structured... | [dataProvider] | [US History] | [{'institutionName': 'The U.S. National Archiv... | [{"policyName": "Contribution Policy", "policy... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "Copyrights", "dataLicens... | restricted | [] | ["unknown"] | no | ["https://www.archives.gov/developer#toc-appli... | ["none"] | https://aad.archives.gov/aad/help/getting-star... | [] | unknown | unknown | [] | [] | {"syndication": "http://www.archives.gov/socia... | NaN | 2012-07-04 | 2021-05-25 | re3data_r3d100000002 |
2 | r3d100000004 | Datenbank Gesprochenes Deutsch | deu | [{'additionalName': 'DGD', 'additionalNameLang... | https://dgd.ids-mannheim.de/ | [] | ["dgd@ids-mannheim.de"] | The "Database for Spoken German (DGD)" is a co... | eng | [disciplinary] | {"size": "34 corpora", "updatedp": "2020-02-03"} | 2012 | NaN | ["deu"] | [1 Humanities and Social Sciences, 104 Linguis... | https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... | [Audiovisual data, Standard office documents, ... | [dataProvider, serviceProvider] | [Australian German, FOLK, German dialects, Pfe... | [{'institutionName': 'Institut für Deutsche Sp... | [{"policyName": "Erfurter Aufruf zur Sicherung... | {"databaseAccessType": "restricted", "databas... | [] | [{"dataAccessType": "restricted", "dataAccessR... | [{"dataLicenseName": "other", "dataLicenseURL"... | restricted | [] | ["other"] | yes | [] | ["none"] | http://agd.ids-mannheim.de/konditionen.shtml | [] | unknown | unknown | ["RatSWD"] | [] | {} | NaN | 2012-07-20 | 2020-08-27 | re3data_r3d100000004 |
3 | r3d100000005 | UNC Dataverse | eng | [{'additionalName': 'University of North Carol... | https://dataverse.unc.edu/ | [] | ["https://dataverse.unc.edu/", "odumarchive@un... | UNC Dataverse is an open-source repository sof... | eng | [institutional] | {"size": "186 dataverses; 25.272 studies; 229.... | 2011 | NaN | ["eng"] | [1 Humanities and Social Sciences, 111 Social ... | https://odum.unc.edu/about/mission-vision/ | [Archived data, Plain text, Raw data, Scientif... | [dataProvider, serviceProvider] | [FAIR, census, demographic survey, demography,... | [{'institutionName': 'Odum Institute for Resea... | [{"policyName": "Collection Development Policy... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [{"dataUploadLicenseName": "Data Deposit Form"... | ["DataVerse"] | yes | ["https://guides.dataverse.org/en/latest/api/n... | ["ARK", "DOI", "PURL", "URN", "hdl"] | https://dataverse.org/best-practices/data-cita... | [] | unknown | yes | [] | [{"metadataStandardName": "DDI - Data Document... | {} | UNC Dataverse is covered by Clarivate Data Cit... | 2012-07-23 | 2021-08-11 | re3data_r3d100000005 |
4 | r3d100000006 | Archaeology Data Service | eng | [{'additionalName': 'ADS', 'additionalNameLang... | https://archaeologydataservice.ac.uk/ | [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] | ["help@archaeologydataservice.ac.uk", "https:/... | The ADS is an accredited digital repository fo... | eng | [disciplinary] | {"size": "1837 results", "updatedp": "2020-05-... | 1996-10-01 | NaN | ["eng"] | [1 Humanities and Social Sciences, 101 Ancient... | https://archaeologydataservice.ac.uk/about/our... | [Archived data, Audiovisual data, Databases, I... | [dataProvider, serviceProvider] | [FAIR, archaeology, cultural heritage, prehist... | [{'institutionName': 'Arts and Humanities Rese... | [{"policyName": "ADS Guides to good practice",... | {"databaseAccessType": "open", "databaseAcces... | [{"databaseLicenseName": "CC", "databaseLicens... | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [{"dataUploadLicenseName": "Guidelines for Dep... | ["other"] | yes | ["https://archaeologydataservice.ac.uk/about/e... | ["DOI"] | https://archaeologydataservice.ac.uk/advice/te... | [] | unknown | yes | ["other"] | [{"metadataStandardName": "DataCite Metadata S... | {"syndication": "https://archaeologydataservic... | ADS is covered by Clarivate Data Citation Inde... | 2012-07-23 | 2021-09-02 | re3data_r3d100000006 |
In [4]:
opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
converters={'repository_metadata.content_subjects_phrases': ast.literal_eval,
'repository_metadata.alternativename': ast.literal_eval,
'repository_metadata.content_types': ast.literal_eval,
'organization': ast.literal_eval
},
dtype={'system_metadata.id': str})
opendoar_df['unique_id'] = 'OpenDOAR_' + opendoar_df['system_metadata.id']
opendoar_df = opendoar_df.add_prefix('OpenDOAR_')
opendoar_df.head()
Out[4]:
OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects_phrases | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 175 | {"name": "hku theses online", "language": "en"} | [] | http://hub.hku.hk/handle/10722/1057 | this is an institutional repository providing ... | institutional | ["zh", "en"] | 2021-03-25 10:16:18 | 2005-12-21 12:44:08 | [multidisciplinary] | [bibliographic_references, theses_and_disserta... | [{'name': 'university of hong kong', 'alternat... | [] | {"name": "dspace", "version": "cris-5.3.1-snap... | NaN | yes | OpenDOAR_175 |
1 | 64 | {"name": "research support scheme - central eu... | [] | http://rss.archives.ceu.hu/ | this is an institutional repository collecting... | institutional | ["cs", "en", "hu", "ru"] | 2021-03-25 09:48:31 | 2006-01-04 14:59:30 | [multidisciplinary] | [unpub_reports_and_working_papers] | [{'name': 'central european university', 'alte... | [] | {"name": "eprints", "version": "2.2.1"} | http://rss.archives.ceu.hu/perl/oai2 | yes | OpenDOAR_64 |
2 | 151 | {"name": "cadmus, eui research repository", "l... | [] | http://cadmus.eui.eu/ | cadmus is the name of the eui research reposit... | institutional | ["nl", "en", "fr", "de", "it"] | 2021-09-13 13:35:36 | 2006-01-04 12:07:07 | [history and archaeology, multidisciplinary, s... | [journal_articles, theses_and_dissertations, u... | [{'name': 'european university institute', 'al... | [{"policy_url": "https://www.eui.eu/research/e... | {"name": "dspace", "version": "5.2"} | http://cadmus.eui.eu/oai/request | yes | OpenDOAR_151 |
3 | 105 | {"name": "document server@uhasselt", "language... | [] | https://doclib.uhasselt.be/dspace/ | this site is a university repository providing... | institutional | ["nl", "en", "fr", "de"] | 2021-04-16 15:23:52 | 2006-01-24 15:46:44 | [multidisciplinary] | [journal_articles, conference_and_workshop_pap... | [{'name': 'uhasselt', 'alternativeName': 'hass... | [] | {"name": "dspace", "version": "1.7.2"} | http://doclib.uhasselt.be/dspace-oai/request | yes | OpenDOAR_105 |
4 | 101 | {"name": "utrecht university repository", "lan... | [] | http://dspace.library.uu.nl | this site is a university repository providing... | institutional | ["nl", "en"] | 2021-04-16 15:22:03 | 2006-01-13 12:55:13 | [multidisciplinary] | [journal_articles, conference_and_workshop_pap... | [{'name': 'university of utrecht', 'alternativ... | [] | {"name": "dspace", "version": ""} | https://dspace.library.uu.nl/oai/request | yes | OpenDOAR_101 |
In [5]:
roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
roar_df = roar_df.groupby('eprintid').aggregate(set)
def value_or_list(cell_set):
copy = set(cell_set)
copy.discard(np.nan)
if len(copy) == 0:
return np.nan
if len(copy) == 1:
return copy.pop()
return list(copy)
roar_df = roar_df.applymap(value_or_list)
roar_df.reset_index(inplace=True)
roar_df['unique_id'] = 'roar_' + roar_df.eprintid
roar_df = roar_df.add_prefix('roar_')
roar_df.head()
Out[5]:
roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 633 | archive | 1 | NaN | NaN | disk0/00/00/00/01 | 2010-01-06 13:43:48 | 2011-07-18 05:40:07 | 2010-01-06 13:43:48 | subject | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://archivesic.ccsd.cnrs.fr/ | @RCHIVESIC | http://archivesic.ccsd.cnrs.fr/oai/oai.php | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | fr | NaN | NaN | NaN | hal | geoname_2_FR | other | NaN | 2002-05-17 19:24:41 | NaN | NaN | 0 | 0 | 0 | 25 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... | NaN | NaN | NaN | NaN | [opendoar, celestial] | [669, 58] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_1 |
1 | 10 | 511 | archive | 1 | NaN | NaN | disk0/00/00/00/10 | 2010-01-06 13:43:48 | 2011-07-18 05:40:13 | 2010-01-06 13:43:48 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://www.diva-portal.org/mdh/ | Academic Archive On-line (Mälardalen Universit... | http://www.diva-portal.org/oai/mdh/OAI | NaN | NaN | NaN | NaN | TRUE | TRUE | NaN | NaN | NaN | se | Uppsala | 59.8667 | 17.6333 | diva | geoname_2_SE | other | NaN | 2005-12-08 13:15:22 | NaN | NaN | 0 | 0 | 0 | 100 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,100... | NaN | NaN | NaN | NaN | [opendoar, celestial] | [526, 258] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_10 |
2 | 1000 | 274 | archive | 1 | NaN | NaN | disk0/00/00/10/00 | 2010-01-06 13:45:01 | 2011-07-06 08:21:21 | 2010-01-06 13:45:01 | subject | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://pam.pisharp.org/ | PAM - Portuguese Archive of Mathematics | NaN | NaN | NaN | NaN | NaN | TRUE | TRUE | NaN | NaN | NaN | pt | Bellevue, WA | 47.6034 | -122.155 | dspace | geoname_2_PT | other | NaN | 2006-05-04 10:48:14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_1000 |
3 | 10001 | 20 | archive | 91 | NaN | NaN | disk0/00/01/00/01 | 2015-08-08 14:52:11 | 2016-03-21 19:44:01 | 2015-08-08 14:52:11 | subject | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://edoc.sub.uni-hamburg.de/klimawandel/ | Klimawandel Dokumentenserver | http://edoc.sub.uni-hamburg.de/klimawandel/oai | NaN | NaN | NaN | The "Documentenserver Klimawandel" (Repository... | TRUE | TRUE | TRUE | [Climate Service Center 2.0, Helmholtz-Zentrum... | [http://www.hzg.de/, http://www.klimzug.de/de/... | de | Hamburg | 53.5511 | 9.9937 | opus | geoname_2_DE | other | [GE, GF, G1, S1, HD] | 2015-07-02 08:08:31 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [opendoar, celestial] | [3408, 5881] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_10001 |
4 | 10008 | 11 | archive | 404 | NaN | NaN | disk0/00/01/00/08 | 2015-08-08 14:52:26 | 2016-03-21 19:43:51 | 2015-08-08 14:52:26 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://creativematter.skidmore.edu/ | Creative Matter | Skidmore College Research | http://creativematter.skidmore.edu/do/oai/ | NaN | http://creativematter.skidmore.edu/recent.rss | NaN | Welcome to Creative Matter, a repository for t... | TRUE | FALSE | FALSE | Skidmore College | http://www.skidmore.edu/ | us | Saratoga Springs | 43.0961 | -73.7818 | bepress | geoname_2_US | other | NaN | 2015-07-06 17:35:50 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | celestial | 5882 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_10008 |
In [6]:
roar_df[roar_df.roar_eprintid == '10013']
Out[6]:
roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | 10013 | 31 | archive | 7104 | NaN | NaN | disk0/00/01/00/13 | 2015-08-08 14:53:04 | 2016-03-21 19:54:43 | 2015-08-08 14:53:04 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://er.ucu.edu.ua/ | ErUCU: Electronic repository of the Ukrainian ... | http://er.ucu.edu.ua/oai/request | http://er.ucu.edu.ua/sword/ | http://er.ucu.edu.ua/feed/rss_2.0/site | NaN | Ukrainian Catholic University’s institutional ... | TRUE | TRUE | TRUE | Ukrainian Catholic University | http://ucu.edu.ua/eng/ | ua | Lviv | NaN | NaN | dspace | geoname_2_UA | other | [H1, L1, AC, D204, B1, D1, DK, BF, BS, HM, BL,... | 2015-07-07 12:38:37 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [opendoar, celestial] | [3410, 5883] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [russell_group, ivy_league] | roar_10013 |
Loading dedup results¶
In [7]:
dup = pd.read_csv('../data/interim/fairsharing_dedup.csv', sep=';', quotechar='"', header=None, names=['dedup_id', 'duplicate_id', 'original_id', 'name', 'source'])
dup['unique_id'] = dup.source + '_' + dup.original_id
dup.head()
Out[7]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
0 | dedup::860320be12a1c050cd7731794e231bd3 | opendoar____::2290a7385ed77cc5592dc2153229f082 | 1064 | oxford university research archive | OpenDOAR | OpenDOAR_1064 |
1 | dedup::1aa7a8773e6a7fdacbcedf9999009a38 | opendoar____::191f8f858acda435ae0daf994e2a72c2 | 8648 | digital commons@georgia southern | OpenDOAR | OpenDOAR_8648 |
2 | dedup::31bceb0c3e2a260593e1e36655ebcee4 | opendoar____::d5776aeecb3c45ab15adce6f5cb355f3 | 9713 | materials data repository | OpenDOAR | OpenDOAR_9713 |
3 | dedup::e37b08dd3015330dcbb5d6663667b8b8 | opendoar____::18997733ec258a9fcaf239cc55d53363 | 427 | digital repository at the university of maryland | OpenDOAR | OpenDOAR_427 |
4 | dedup::2841194266115ac1cc04d19630cde46b | re3data_____::3afbb2b45a3dd218a5a091ca773cf6c5 | r3d100011189 | PRISM: University of Calgary's Digital Repository | re3data | re3data_r3d100011189 |
In [8]:
dup.describe()
Out[8]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
count | 4617 | 4617 | 4617 | 4617 | 4617 | 4617 |
unique | 2191 | 4617 | 4159 | 3968 | 4 | 4617 |
top | dedup::75e33da9b103b7b91dcd8da0abe1354b | opendoar____::2290a7385ed77cc5592dc2153229f082 | 2399 | UPN JATIM REPOSITORY | roar | OpenDOAR_1064 |
freq | 5 | 1 | 3 | 4 | 1977 | 1 |
Assessing duplicates across registries¶
In [9]:
dup_grouped = dup.groupby('dedup_id').aggregate(list)
dup_grouped['source_set'] = dup_grouped.source.map(set)
In [10]:
dup_grouped[dup_grouped.source_set.str.len() == 4].count()
Out[10]:
duplicate_id 6 original_id 6 name 6 source 6 unique_id 6 source_set 6 dtype: int64
In [11]:
dup_grouped[dup_grouped.source_set.str.len() == 3].count()
Out[11]:
duplicate_id 60 original_id 60 name 60 source 60 unique_id 60 source_set 60 dtype: int64
In [12]:
dup_grouped[dup_grouped.source_set.str.len() == 2].count()
Out[12]:
duplicate_id 1986 original_id 1986 name 1986 source 1986 unique_id 1986 source_set 1986 dtype: int64
In [13]:
dup_grouped[dup_grouped.source_set.str.len() == 1].count()
Out[13]:
duplicate_id 139 original_id 139 name 139 source 139 unique_id 139 source_set 139 dtype: int64
Assessing duplicates within registries¶
In [65]:
opendoar_dup = dup[dup.source == 'OpenDOAR'].groupby('dedup_id').count()
opendoar_dup[opendoar_dup.duplicate_id > 1].aggregate(['count', 'sum'])
Out[65]:
duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|
count | 28 | 28 | 28 | 28 | 28 |
sum | 58 | 58 | 58 | 58 | 58 |
In [64]:
re3data_dup = dup[dup.source == 're3data'].groupby('dedup_id').count()
re3data_dup[re3data_dup.duplicate_id > 1].aggregate(['count', 'sum'])
Out[64]:
duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|
count | 3 | 3 | 3 | 3 | 3 |
sum | 6 | 6 | 6 | 6 | 6 |
In [63]:
roar_dup = dup[dup.source == 'roar'].groupby('dedup_id').count()
roar_dup[roar_dup.duplicate_id > 1].aggregate(['count', 'sum'])
Out[63]:
duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|
count | 249 | 249 | 249 | 249 | 249 |
sum | 518 | 518 | 518 | 518 | 518 |
In [53]:
fairsharing_dup = dup[dup.source == 'FAIRsharing'].groupby('dedup_id').count()
fairsharing_dup[fairsharing_dup.duplicate_id > 1].count()
Out[53]:
duplicate_id 0 original_id 0 name 0 source 0 unique_id 0 dtype: int64
Isolating duplicates within a registry¶
In [14]:
dup_within = dup.groupby('dedup_id').aggregate(list)
dup_within['source_set'] = dup_within.source.map(set)
dup_within = dup_within[dup_within.source_set.str.len() == 1]
dup_within.head()
Out[14]:
duplicate_id | original_id | name | source | unique_id | source_set | |
---|---|---|---|---|---|---|
dedup_id | ||||||
dedup::000871c1fc726f0b52dc86a4eeb027de | [4612, 4649] | [4612, 4649] | [IIT Bombay Institutional Repository, IIT Bomb... | [roar, roar] | [roar_4612, roar_4649] | {roar} |
dedup::0163cceb20f5ca7b313419c068abd9dc | [7943, 8003] | [7943, 8003] | [EPrints@NIRT Library Welcomes! - EPrints@NITR... | [roar, roar] | [roar_7943, roar_8003] | {roar} |
dedup::028ee724157b05d04e7bdcf237d12e60 | [2670, 2698, 2741] | [2670, 2698, 2741] | [HSF Brage Open Research Archive, HSF Brage Op... | [roar, roar, roar] | [roar_2670, roar_2698, roar_2741] | {roar} |
dedup::03593ce517feac573fdaafa6dcedef61 | [4393, 4394] | [4393, 4394] | [Institutional Repository of Kunming Institute... | [roar, roar] | [roar_4393, roar_4394] | {roar} |
dedup::03e0704b5690a2dee1861dc3ad3316c9 | [1019, 5550] | [1019, 5550] | [PolyU Institutional Repository, PolyU Institu... | [roar, roar] | [roar_1019, roar_5550] | {roar} |
In [15]:
dup_within['source_set'] = dup_within.source_set.map(set.pop)
dup_within.head()
Out[15]:
duplicate_id | original_id | name | source | unique_id | source_set | |
---|---|---|---|---|---|---|
dedup_id | ||||||
dedup::000871c1fc726f0b52dc86a4eeb027de | [4612, 4649] | [4612, 4649] | [IIT Bombay Institutional Repository, IIT Bomb... | [roar, roar] | [roar_4612, roar_4649] | roar |
dedup::0163cceb20f5ca7b313419c068abd9dc | [7943, 8003] | [7943, 8003] | [EPrints@NIRT Library Welcomes! - EPrints@NITR... | [roar, roar] | [roar_7943, roar_8003] | roar |
dedup::028ee724157b05d04e7bdcf237d12e60 | [2670, 2698, 2741] | [2670, 2698, 2741] | [HSF Brage Open Research Archive, HSF Brage Op... | [roar, roar, roar] | [roar_2670, roar_2698, roar_2741] | roar |
dedup::03593ce517feac573fdaafa6dcedef61 | [4393, 4394] | [4393, 4394] | [Institutional Repository of Kunming Institute... | [roar, roar] | [roar_4393, roar_4394] | roar |
dedup::03e0704b5690a2dee1861dc3ad3316c9 | [1019, 5550] | [1019, 5550] | [PolyU Institutional Repository, PolyU Institu... | [roar, roar] | [roar_1019, roar_5550] | roar |
In [16]:
dup_within.groupby('source_set').count()
Out[16]:
duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|
source_set | |||||
OpenDOAR | 16 | 16 | 16 | 16 | 16 |
re3data | 2 | 2 | 2 | 2 | 2 |
roar | 121 | 121 | 121 | 121 | 121 |
In [17]:
dup_within = dup[dup.dedup_id.isin(dup_within.index)]
dup_within
Out[17]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
28 | dedup::d2ddea18f00665ce8623e36bd4e3c7c5 | 8237 | 8237 | AIR | Archivio Istituzionale della Ricerca | roar | roar_8237 |
31 | dedup::4c5bcfec8584af0d967f1ab10179ca4b | 2820 | 2820 | USU Repository: Open Access Repository | roar | roar_2820 |
46 | dedup::c2ae5cb2426d96ed19a50b0b7d7c8e11 | 9487 | 9487 | IR at NRF: Home | roar | roar_9487 |
53 | dedup::1c65cef3dfd1e00c0b03923a1c591db4 | 1241 | 1241 | Swansea Metropolitan University Repository | roar | roar_1241 |
59 | dedup::4217ec5d78c4bc4e5bd006783482441f | 15142 | 15142 | Repositorio Institucional | roar | roar_15142 |
... | ... | ... | ... | ... | ... | ... |
4560 | dedup::fc394e9935fbd62c8aedc372464e1965 | 7161 | 7161 | Welcome to IR@NPL | roar | roar_7161 |
4586 | dedup::000871c1fc726f0b52dc86a4eeb027de | 4649 | 4649 | IIT Bombay Institutional Repository | roar | roar_4649 |
4587 | dedup::72c288a828485e5b1d4c52910d106734 | 16867 | 16867 | Chung Shan Medical University Institutional Re... | roar | roar_16867 |
4598 | dedup::0163cceb20f5ca7b313419c068abd9dc | 8003 | 8003 | EPrints@NIRT Library Welcomes! - EPrints@NIRT | roar | roar_8003 |
4608 | dedup::2aeb1a8f8475cef63900be5d0780e872 | 15471 | 15471 | Repository STIE Nobel Indonesia | roar | roar_15471 |
287 rows × 6 columns
Isolating duplicates across registries (hybrid)¶
In [18]:
dup_across = dup[~dup.dedup_id.isin(dup_within.dedup_id)]
dup_across = dup_across.groupby('dedup_id').aggregate(list)
dup_across['source_set'] = dup_across.source.map(set)
dup_hybrid = dup_across[dup_across.source_set.str.len() < dup_across.source.str.len()]
dup_hybrid = dup[dup.dedup_id.isin(dup_hybrid.index)]
dup_hybrid
Out[18]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
12 | dedup::471c50ad1a156d7256eddfd747d77931 | opendoar____::6351bf9dce654515bf1ddbd6426dfa97 | 1996 | ehtc repositorio institucional | OpenDOAR | OpenDOAR_1996 |
21 | dedup::69dafe8b58066478aea48f3d0f384820 | 2312 | 2312 | Göteborgs universitets publikationer - e-publi... | roar | roar_2312 |
26 | dedup::8f822ac814829da24a7065b8131bdf47 | opendoar____::a34bacf839b923770b2c360eefa26748 | 1035 | kitami institute of technology repository | OpenDOAR | OpenDOAR_1035 |
41 | dedup::63a99723ebb3af94d52b474c3b21dbe1 | 5779 | 5779 | Sanok Digital Library | roar | roar_5779 |
47 | dedup::82680bfec0fa08346c1b10d30a3e3d4a | 11212 | 11212 | Publication Server of the Wuppertal Institute | roar | roar_11212 |
... | ... | ... | ... | ... | ... | ... |
4601 | dedup::7810ccd41bf26faaa2c4e1f20db70a71 | 3172 | 3172 | Tesis Electrónicas UACh | roar | roar_3172 |
4602 | dedup::e655c7716a4b3ea67f48c6322fc42ed6 | opendoar____::52c5189391854c93e8a0e1326e56c14f | 1637 | vtext digital repository | OpenDOAR | OpenDOAR_1637 |
4603 | dedup::5ebe5626b9f1cd89fbb9f665a527591f | 16225 | 16225 | Necmettin Erbakan University Institutional Rep... | roar | roar_16225 |
4605 | dedup::ec0bfd000f253eff3acb1043e1c06979 | opendoar____::aa2a77371374094fe9e0bc1de3f94ed9 | 1829 | npue ir | OpenDOAR | OpenDOAR_1829 |
4610 | dedup::1c7836dbabd12c458d20e3b35633733a | 14616 | 14616 | SOAR@USA: Scholarship and Open Access Repository | roar | roar_14616 |
440 rows × 6 columns
Isolating duplicates across registries (pure)¶
In [19]:
dup_across = dup_across[dup_across.source_set.str.len() == dup_across.source.str.len()]
dup_across = dup[dup.dedup_id.isin(dup_across.index)]
dup_across
# dup[dup.dedup_id.isin(dup_across.index)]
Out[19]:
dedup_id | duplicate_id | original_id | name | source | unique_id | |
---|---|---|---|---|---|---|
0 | dedup::860320be12a1c050cd7731794e231bd3 | opendoar____::2290a7385ed77cc5592dc2153229f082 | 1064 | oxford university research archive | OpenDOAR | OpenDOAR_1064 |
1 | dedup::1aa7a8773e6a7fdacbcedf9999009a38 | opendoar____::191f8f858acda435ae0daf994e2a72c2 | 8648 | digital commons@georgia southern | OpenDOAR | OpenDOAR_8648 |
2 | dedup::31bceb0c3e2a260593e1e36655ebcee4 | opendoar____::d5776aeecb3c45ab15adce6f5cb355f3 | 9713 | materials data repository | OpenDOAR | OpenDOAR_9713 |
3 | dedup::e37b08dd3015330dcbb5d6663667b8b8 | opendoar____::18997733ec258a9fcaf239cc55d53363 | 427 | digital repository at the university of maryland | OpenDOAR | OpenDOAR_427 |
4 | dedup::2841194266115ac1cc04d19630cde46b | re3data_____::3afbb2b45a3dd218a5a091ca773cf6c5 | r3d100011189 | PRISM: University of Calgary's Digital Repository | re3data | re3data_r3d100011189 |
... | ... | ... | ... | ... | ... | ... |
4612 | dedup::5ef0b4eba35ab2d6180b0bca7e46b6f9 | 475 | 475 | Ecological Restoration Institute - Northern Ar... | roar | roar_475 |
4613 | dedup::66e8d052ec2230c66bd11ee6b5a0e3c8 | 14199 | 14199 | Repositori STKIP PGRI Sumenep | roar | roar_14199 |
4614 | dedup::1216a1bca4361c39d1d77965c5d95ee3 | 4960 | 4960 | Virtual Archive of Polish Armenians | roar | roar_4960 |
4615 | dedup::1408358fe6a7f9327dd41a5651ac284c | 13824 | 13824 | Digital Commons @ New Jersey Institute of Tech... | roar | roar_13824 |
4616 | dedup::5cc33dfe7e069a757ca0fcbe6b95c89e | opendoar____::d8a4e572d866aa45da78418d9d2ff9f9 | 4351 | odu digital commons | OpenDOAR | OpenDOAR_4351 |
3890 rows × 6 columns
Double check partitions
In [20]:
dup.count()
Out[20]:
dedup_id 4617 duplicate_id 4617 original_id 4617 name 4617 source 4617 unique_id 4617 dtype: int64
In [21]:
dup_across.count() + dup_within.count() + dup_hybrid.count()
Out[21]:
dedup_id 4617 duplicate_id 4617 original_id 4617 name 4617 source 4617 unique_id 4617 dtype: int64
In [22]:
dup_within.groupby('dedup_id').ngroups + dup_across.groupby('dedup_id').ngroups + dup_hybrid.groupby('dedup_id').ngroups
Out[22]:
2191
In [23]:
dup.groupby('dedup_id').ngroups
Out[23]:
2191
Joining information¶
In [24]:
dup_within = dup_within.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_within = dup_within.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_within = dup_within.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_within = dup_within.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_within.head()
Out[24]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects_phrases | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::d2ddea18f00665ce8623e36bd4e3c7c5 | 8237 | 8237 | AIR | Archivio Istituzionale della Ricerca | roar | roar_8237 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 8237 | 17 | archive | 5268 | NaN | NaN | disk0/00/00/82/37 | 2014-05-15 11:23:30 | 2014-05-19 05:42:47 | 2014-05-15 11:23:30 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://air.unimi.it | AIR | Archivio Istituzionale della Ricerca | http://air.unimi.it/dspace-oai/request | NaN | NaN | NaN | AIR (Archivio Istituzionale della ricerca) is ... | FALSE | FALSE | TRUE | Università degli Studi di Milano | http://www.unimi.it | it | Milan | 45.46 | 9.1947 | dspace | geoname_2_IT | other | NaN | 2014-05-04 17:40:53 | NaN | NaN | 0 | 0 | 0 | 99 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,6... | NaN | NaN | NaN | NaN | celestial | 1596 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_8237 |
1 | dedup::4c5bcfec8584af0d967f1ab10179ca4b | 2820 | 2820 | USU Repository: Open Access Repository | roar | roar_2820 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2820 | 525 | archive | 65 | NaN | NaN | disk0/00/00/28/20 | 2010-07-29 01:40:27 | 2012-01-19 11:37:49 | 2010-07-29 01:40:27 | institutional | 2372 | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://repository.usu.ac.id | USU Repository: Open Access Repository | http://repository.usu.ac.id/oai/request | NaN | http://repository.usu.ac.id/feed/rss_2.0/site | NaN | Comprises of works by and/or about the univers... | TRUE | TRUE | FALSE | [USU Library, University of Sumatera Utara] | [http://library.usu.ac.id, http://www.usu.ac.id] | id | Medan | 3.5595 | 98.6572 | dspace | geoname_2_ID | other | NaN | 2010-01-15 10:09:25 | NaN | NaN | 0 | 0 | 0 | 100 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,52,... | NaN | NaN | NaN | NaN | [roarmap, opendoar, celestial] | [283, 1717, 2101] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_2820 |
2 | dedup::c2ae5cb2426d96ed19a50b0b7d7c8e11 | 9487 | 9487 | IR at NRF: Home | roar | roar_9487 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9487 | 16 | archive | 6458 | NaN | NaN | disk0/00/00/94/87 | 2015-05-15 14:03:55 | 2016-03-21 20:21:02 | 2015-05-15 14:03:55 | multi | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://ir.nrf.ac.za/ | IR at NRF: Home | NaN | NaN | NaN | NaN | The NRF receives its mandate from the National... | TRUE | TRUE | FALSE | National Research Foundation of South Africa | http://www.nrf.ac.za/ | za | Pretoria | NaN | NaN | dspace | geoname_2_ZA | other | [B1, AS, AI] | 2015-02-10 06:35:50 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roarmap | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_9487 |
3 | dedup::1c65cef3dfd1e00c0b03923a1c591db4 | 1241 | 1241 | Swansea Metropolitan University Repository | roar | roar_1241 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1241 | 583 | archive | 1 | NaN | NaN | disk0/00/00/12/41 | 2010-01-06 13:45:32 | 2011-07-18 05:57:23 | 2010-01-06 13:45:32 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://dspace.smu.ac.uk/dspace/ | Swansea Metropolitan University Repository | http://dspace.smu.ac.uk/dspace-oai/request | NaN | NaN | NaN | Users may set up RSS feeds to be alerted to ne... | NaN | NaN | NaN | Swansea Metropolitan University | http://www.smu.ac.uk/ | gb | Swansea | 51.6144 | -3.8727 | dspace | geoname_2_GB | other | NaN | 2008-05-15 11:29:17 | NaN | NaN | 0 | 0 | 0 | 135 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,135,13... | 0 | 0 | 0 | 0 | [opendoar, celestial] | [1779, 1627] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_1241 |
4 | dedup::4217ec5d78c4bc4e5bd006783482441f | 15142 | 15142 | Repositorio Institucional | roar | roar_15142 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 15142 | 11 | archive | 12132 | NaN | NaN | disk0/00/01/51/42 | 2020-08-08 12:35:50 | 2021-01-25 22:45:10 | 2020-08-08 12:35:50 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://repositorio.undar.edu.pe/ | Repositorio Institucional | http://repositorio.undar.edu.pe/ | NaN | NaN | NaN | NaN | FALSE | FALSE | FALSE | NaN | NaN | pe | huanuco | -9.9269 | -76.2396 | dspace | geoname_2_PE | other | NaN | 2019-09-02 21:20:31 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | opendoar | http://v2.sherpa.ac.uk/id/repository/4422 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_15142 |
In [25]:
dup_hybrid = dup_hybrid.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_hybrid = dup_hybrid.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_hybrid.head()
Out[25]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects_phrases | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::471c50ad1a156d7256eddfd747d77931 | opendoar____::6351bf9dce654515bf1ddbd6426dfa97 | 1996 | ehtc repositorio institucional | OpenDOAR | OpenDOAR_1996 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1996 | {"name": "ehtc repositorio institucional", "la... | [] | http://www.repositorio.ehtc.cu/jspui/ | this site provides access to the hospitality a... | institutional | ["es"] | 2019-10-17 14:34:31 | 2010-12-01 11:11:57 | [business and economics, education] | [journal_articles, conference_and_workshop_pap... | [{'name': 'escuela de hotelería y turismo de c... | [] | {"name": "dspace", "version": "1.6.2"} | NaN | yes | OpenDOAR_1996 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | dedup::69dafe8b58066478aea48f3d0f384820 | 2312 | 2312 | Göteborgs universitets publikationer - e-publi... | roar | roar_2312 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2312 | 736 | archive | 1 | NaN | NaN | disk0/00/00/23/12 | 2010-01-14 12:10:06 | 2011-07-18 06:01:08 | 2010-01-14 12:10:06 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | http://gupea.ub.gu.se/dspace/index.jsp | Göteborgs universitets publikationer - e-publi... | http://gupea.ub.gu.se/dspace-oai/request | NaN | NaN | NaN | This is an institutional repository providing ... | FALSE | FALSE | FALSE | Göteborgs Universitet | http://www.gu.se/ | se | NaN | 57.6975 | 11.9608 | dspace | NaN | other | NaN | 2005-06-07 12:57:08 | NaN | NaN | 0 | 0 | 0 | 96 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... | NaN | NaN | NaN | NaN | [opendoar, celestial] | [1832, 1149] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_2312 |
2 | dedup::8f822ac814829da24a7065b8131bdf47 | opendoar____::a34bacf839b923770b2c360eefa26748 | 1035 | kitami institute of technology repository | OpenDOAR | OpenDOAR_1035 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1035 | {"name": "kitami institute of technology repos... | [{'name': '北見工業大学学術機関リポジトリ kit-r', 'language':... | https://kitami-it.repo.nii.ac.jp/ | this site is a university repository providing... | institutional | ["ja", "en"] | 2020-09-09 11:57:56 | 2007-10-09 09:09:40 | [technology general] | [journal_articles, unpub_reports_and_working_p... | [{'name': 'kitami institute of technology', 'a... | [] | {"name": "weko", "version": ""} | http://kitami-it.repo.nii.ac.jp/oai | yes | OpenDOAR_1035 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | dedup::63a99723ebb3af94d52b474c3b21dbe1 | 5779 | 5779 | Sanok Digital Library | roar | roar_5779 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5779 | 9 | archive | 8 | NaN | NaN | disk0/00/00/57/79 | 2012-12-12 04:54:20 | 2012-12-15 02:36:20 | 2012-12-12 04:54:20 | other | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | http://sanockabibliotekacyfrowa.pl/dlibra | Sanok Digital Library | http://sanockabibliotekacyfrowa.pl/dlibra/oai-... | NaN | NaN | NaN | This site provides access to the digitised col... | NaN | NaN | NaN | Digital-Center | http://www.digital-center.pl/ | pl | NaN | 52.4872 | 16.8493 | NaN | geoname_2_PL | other | NaN | 2012-08-05 15:12:12 | NaN | NaN | 0 | 0 | 0 | 19 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,19,19... | NaN | NaN | NaN | NaN | [opendoar, celestial] | [2545, 5072] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_5779 |
4 | dedup::82680bfec0fa08346c1b10d30a3e3d4a | 11212 | 11212 | Publication Server of the Wuppertal Institute | roar | roar_11212 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11212 | 12 | archive | 5611 | NaN | NaN | disk0/00/01/12/12 | 2016-05-04 11:37:14 | 2016-05-07 01:37:18 | 2016-05-04 11:37:14 | institutional | NaN | NaN | show | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | https://epub.wupperinst.org/home | Publication Server of the Wuppertal Institute\... | https://epub.wupperinst.org/oai | NaN | https://epub.wupperinst.org/rss | NaN | \n\nOn this Publication Server of the Wupperta... | TRUE | TRUE | FALSE | Wuppertal Institut für Klima, Umwelt, Energie | http://wupperinst.org/ | de | Wuppertal | 51.2562 | 7.1508 | opus | geoname_2_DE | other | [HB, GE, T1] | 2016-04-28 13:58:38 | NaN | please delete ID 5891 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [opendoar, celestial] | [2539, 6112] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | roar_11212 |
In [26]:
dup_across = dup_across.merge(fairsharing_df, left_on='unique_id', right_on='FAIRsharing_unique_id', how='left')
dup_across = dup_across.merge(re3data_df, left_on='unique_id', right_on='re3data_unique_id', how='left')
dup_across = dup_across.merge(opendoar_df, left_on='unique_id', right_on='OpenDOAR_unique_id', how='left')
dup_across = dup_across.merge(roar_df, left_on='unique_id', right_on='roar_unique_id', how='left')
dup_across.head()
Out[26]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects_phrases | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::860320be12a1c050cd7731794e231bd3 | opendoar____::2290a7385ed77cc5592dc2153229f082 | 1064 | oxford university research archive | OpenDOAR | OpenDOAR_1064 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1064 | {"name": "oxford university research archive",... | [{'acronym': 'ora'}] | http://ora.ox.ac.uk | this site provides access to the collected res... | institutional | ["zh", "nl", "en", "fr", "de", "it", "ja", "pt... | 2021-09-13 13:35:44 | 2007-10-10 16:16:02 | [multidisciplinary] | [journal_articles, conference_and_workshop_pap... | [{'name': 'university of oxford', 'alternative... | [{"policy_url": "https://libguides.bodleian.ox... | {"name": "fedora", "version": "4.6.2"} | https://ora.ox.ac.uk/oai2 | yes | OpenDOAR_1064 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | dedup::1aa7a8773e6a7fdacbcedf9999009a38 | opendoar____::191f8f858acda435ae0daf994e2a72c2 | 8648 | digital commons@georgia southern | OpenDOAR | OpenDOAR_8648 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 8648 | {"name": "digital commons@georgia southern", "... | [] | https://digitalcommons.georgiasouthern.edu | this site provides access to the research outp... | institutional | ["en"] | 2021-02-18 18:13:34 | 2019-09-28 04:24:47 | [multidisciplinary] | [journal_articles, conference_and_workshop_pap... | [{'name': 'georgia southern university', 'alte... | [] | {"name": "digital_commons", "version": ""} | https://digitalcommons.georgiasouthern.edu/do/oai | yes | OpenDOAR_8648 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | dedup::31bceb0c3e2a260593e1e36655ebcee4 | opendoar____::d5776aeecb3c45ab15adce6f5cb355f3 | 9713 | materials data repository | OpenDOAR | OpenDOAR_9713 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9713 | {"name": "materials data repository", "languag... | [{'acronym': 'mdr'}] | https://mdr.nims.go.jp | mdr : materials data repository is a data repo... | institutional | ["en", "ja"] | 2021-05-21 18:04:32 | 2020-07-13 10:09:55 | [science general] | [journal_articles, conference_and_workshop_pap... | [{'name': 'national institute for materials sc... | [] | {"name": "fedora", "version": ""} | https://mdr.nims.go.jp/catalog/oai | yes | OpenDOAR_9713 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | dedup::e37b08dd3015330dcbb5d6663667b8b8 | opendoar____::18997733ec258a9fcaf239cc55d53363 | 427 | digital repository at the university of maryland | OpenDOAR | OpenDOAR_427 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 427 | {"name": "digital repository at the university... | [{'acronym': 'drum'}] | http://drum.lib.umd.edu/ | this site is a university repository providing... | institutional | ["en"] | 2021-09-13 13:35:39 | 2006-08-04 09:09:20 | [multidisciplinary] | [journal_articles, theses_and_dissertations, u... | [{'name': 'university of maryland', 'alternati... | [{"policy_url": "http://drum.lib.umd.edu/page/... | {"name": "dspace", "version": "4.1.0"} | http://drum.lib.umd.edu/oai/request | yes | OpenDOAR_427 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | dedup::2841194266115ac1cc04d19630cde46b | re3data_____::3afbb2b45a3dd218a5a091ca773cf6c5 | r3d100011189 | PRISM: University of Calgary's Digital Repository | re3data | re3data_r3d100011189 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | r3d100011189 | PRISM: University of Calgary's Digital Repository | eng | [] | https://prism.ucalgary.ca/ | [OpenDOAR:7771] | ["digitize@ucalgary.ca", "kmeranji@ucalgary.ca"] | PRISM is a digital archive of the University o... | eng | [institutional] | {"size": "", "updatedp": ""} | NaN | NaN | ["eng"] | [1 Humanities and Social Sciences, 11 Humaniti... | NaN | [Audiovisual data, Images, Standard office doc... | [dataProvider] | [multidisciplinary] | [{'institutionName': 'University of Calgary, L... | [{"policyName": "Open Access Mandate", "policy... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [{"dataUploadLicenseName": "Submission Policy"... | ["DSpace"] | NaN | [] | ["DOI", "hdl"] | NaN | [] | no | yes | [] | [] | {"syndication": "http://prism.ucalgary.ca/feed... | NaN | 2014-10-20 | 2020-01-09 | re3data_r3d100011189 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [27]:
dup_within = dup_within.groupby('dedup_id').aggregate(list).reset_index()
dup_within['source_set'] = dup_within.source.map(set)
dup_within.head()
<ipython-input-27-3881fa0a0224>:1: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()` <ipython-input-27-3881fa0a0224>:2: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`
Out[27]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects_phrases | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | source_set | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::000871c1fc726f0b52dc86a4eeb027de | [4612, 4649] | [4612, 4649] | [IIT Bombay Institutional Repository, IIT Bomb... | [roar, roar] | [roar_4612, roar_4649] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [4612, 4649] | [28, 8] | [archive, archive] | [1380, 1380] | [nan, nan] | [nan, nan] | [disk0/00/00/46/12, disk0/00/00/46/49] | [2012-01-08 03:17:02, 2012-02-05 13:57:01] | [2012-04-16 10:53:04, 2012-04-16 10:39:58] | [2012-01-08 03:17:02, 2012-02-05 13:57:01] | [institutional, institutional] | [nan, nan] | [nan, nan] | [show, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [http://dspace.library.iitb.ac.in/jspui/, http... | [IIT Bombay Institutional Repository, IIT Bomb... | [http://dspace.library.iitb.ac.in/oai/request,... | [nan, nan] | [http://dspace.library.iitb.ac.in/xmlui/feed/a... | [nan, nan] | [nan, nan] | [TRUE, TRUE] | [TRUE, TRUE] | [TRUE, FALSE] | [IIT Bombay, IIT Bombay] | [http://www.iitb.ac.in, http://www.iitb.ac.in] | [in, in] | [Mumbai, Mumbai] | [19.133, 19.133] | [72.9166, 72.9166] | [dspace, dspace] | [geoname_2_IN, geoname_2_IN] | [other, other] | TP, TN, TJ, TH, TK, TD, TA], [TA, T1 | [2011-12-15 09:01:35, 2012-01-05 12:09:37] | [nan, nan] | [nan, nan] | [0, nan] | [0, nan] | [0, nan] | [99, nan] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,95,... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [celestial, celestial] | [4790, 4789] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_4612, roar_4649] | {roar} |
1 | dedup::0163cceb20f5ca7b313419c068abd9dc | [7943, 8003] | [7943, 8003] | [EPrints@NIRT Library Welcomes! - EPrints@NITR... | [roar, roar] | [roar_7943, roar_8003] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [7943, 8003] | [16, 19] | [archive, archive] | [4963, 5023] | [nan, nan] | [nan, nan] | [disk0/00/00/79/43, disk0/00/00/80/03] | [2014-03-11 11:54:06, 2014-03-30 18:13:01] | [2014-05-08 13:07:12, 2014-05-08 12:55:41] | [2014-03-11 11:54:06, 2014-03-30 18:13:01] | [institutional, institutional] | [nan, nan] | [nan, nan] | [show, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [http://eprints.nirt.res.in/, http://eprints.n... | [EPrints@NIRT Library Welcomes! - EPrints@NITR... | [http://eprints.nirt.res.in/cgi/oai2, http://e... | [nan, nan] | [http://eprints.nirt.res.in/cgi/latest_tool?ou... | [nan, nan] | [This is the Institutional Repository of the N... | [TRUE, FALSE] | [TRUE, FALSE] | [FALSE, FALSE] | [National Institute for Research in Tuberculos... | [http://www.nirt.res.in/, http://www.nirt.res.in] | [in, in] | [Chennai, Chennai (Madras)] | [nan, 13] | [nan, 80] | [eprints, eprints] | [geoname_2_IN, geoname_2_IN] | [3.3.15 eps, 3.3.15 eps] | RB, RM], [R1, RZ | [2014-03-07 15:07:45, 2014-03-19 07:05:04] | [The National Institute for Research in Tuberc... | [nan, Please include "Tuberculosis" as a Speci... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [[opendoar, celestial], celestial] | [[5410, 2725], 5430] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_7943, roar_8003] | {roar} |
2 | dedup::028ee724157b05d04e7bdcf237d12e60 | [2670, 2698, 2741] | [2670, 2698, 2741] | [HSF Brage Open Research Archive, HSF Brage Op... | [roar, roar, roar] | [roar_2670, roar_2698, roar_2741] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [2670, 2698, 2741] | [470, 317, 231] | [archive, archive, archive] | [235, 8, 8] | [nan, nan, nan] | [nan, nan, nan] | [disk0/00/00/26/70, disk0/00/00/26/98, disk0/0... | [2010-05-04 02:19:51, 2010-05-13 11:01:53, 201... | [2011-07-18 06:02:42, 2011-07-06 08:24:10, 201... | [2010-05-04 02:19:51, 2010-05-13 11:01:53, 201... | [institutional, institutional, institutional] | [nan, nan, nan] | [nan, nan, nan] | [show, show, show] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [0, 0, 0] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [http://brage.bibsys.no/hsf/, http://brage.bib... | [HSF Brage Open Research Archive, HSF Brage Op... | [http://oai.bibsys.no/oai/repository/nora_hsf_... | [http://brage.bibsys.no/hsf/?locale=en, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [This site provides access to the research out... | [TRUE, FALSE, FALSE] | [TRUE, FALSE, FALSE] | [FALSE, FALSE, FALSE] | [Sogn og Fjordane University College, Høgskule... | [http://www.hisf.no/, http://www.hisf.no/, htt... | [no, no, no] | [Sogndal, nan, nan] | [61.2174, 61.2174, 60.3904] | [7.1082, 7.1082, 5.3332] | [dspace, dspace, dspace] | [geoname_2_NO, nan, nan] | [other, other, other] | [nan, nan, nan] | [2010-04-06 13:51:52, 2010-05-09 15:12:16, 201... | [nan, nan, nan] | [nan, nan, nan] | [0, nan, nan] | [0, nan, nan] | [0, nan, nan] | [50, nan, nan] | [0,0,1,1,1,4,4,6,6,7,8,11,12,14,15,17,18,18,18... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [[opendoar, celestial], opendoar, opendoar] | [[2426, 1781], 1781, 1807] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [roar_2670, roar_2698, roar_2741] | {roar} |
3 | dedup::03593ce517feac573fdaafa6dcedef61 | [4393, 4394] | [4393, 4394] | [Institutional Repository of Kunming Institute... | [roar, roar] | [roar_4393, roar_4394] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [4393, 4394] | [14, 14] | [archive, archive] | [986, 986] | [nan, nan] | [nan, nan] | [disk0/00/00/43/93, disk0/00/00/43/94] | [2011-11-09 23:14:52, 2011-11-09 23:14:46] | [2012-02-06 06:58:40, 2012-02-06 06:58:41] | [2011-11-09 23:14:52, 2011-11-09 23:14:46] | [institutional, institutional] | [nan, nan] | [nan, nan] | [show, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [http://159.226.149.42:8088/, http://159.226.1... | [Institutional Repository of Kunming Institute... | [http://159.226.149.42:8088/casirgrid-oai/requ... | [nan, nan] | [nan, nan] | [nan, nan] | [This site provides access to the output of th... | [TRUE, TRUE] | [TRUE, TRUE] | [FALSE, FALSE] | [ Kunming Institute of Zoology Chinese Academy... | [http://www.kiz.ac.cn/, http://www.kiz.ac.cn/] | [cn, cn] | [kunming, kunming] | [25.0416, 25.0416] | [102.755, 102.755] | [dspace, dspace] | [geoname_2_CN, geoname_2_CN] | [other, other] | [nan, nan] | [2010-07-22 16:00:13, 2010-07-22 16:00:13] | [nan, nan] | [nan, nan] | [0, 0] | [0, 0] | [0, 0] | [100, 100] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [celestial, celestial] | [4715, 4715] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_4393, roar_4394] | {roar} |
4 | dedup::03e0704b5690a2dee1861dc3ad3316c9 | [1019, 5550] | [1019, 5550] | [PolyU Institutional Repository, PolyU Institu... | [roar, roar] | [roar_1019, roar_5550] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [1019, 5550] | [526, 9] | [archive, archive] | [1, 8] | [nan, nan] | [nan, nan] | [disk0/00/00/10/19, disk0/00/00/55/50] | [2010-01-06 13:45:03, 2012-12-12 01:25:48] | [2012-01-19 11:35:09, 2012-12-17 06:53:14] | [2010-01-06 13:45:03, 2012-12-12 01:25:48] | [institutional, institutional] | [nan, nan] | [nan, nan] | [show, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [0, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [http://repository.lib.polyu.edu.hk/, http://r... | [PolyU Institutional Repository, PolyU Institu... | [http://repository.lib.polyu.edu.hk/oai/reques... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, This is an Institutional repository prov... | [TRUE, nan] | [TRUE, nan] | [nan, nan] | [The Hong Kong Polytechnic University Pao Yue-... | [http://www.lib.polyu.edu.hk, http://www.polyu... | [hk, cn] | [Hong Kong, nan] | [22.25, 22.3964] | [114.167, 114.109] | [dspace, dspace] | [geoname_2_HK, geoname_2_CN] | [other, other] | [nan, nan] | [2008-10-30 07:50:38, 2012-07-01 15:13:40] | [nan, nan] | [nan, nan] | [0, 0] | [0, 0] | [0, 0] | [86, 86] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,54,71,80,... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [[roarmap, opendoar, celestial], [opendoar, ce... | 193, 1456, 1441], [1456, 1441 | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_1019, roar_5550] | {roar} |
In [28]:
dup_hybrid = dup_hybrid.groupby('dedup_id').aggregate(list).reset_index()
dup_hybrid['source_set'] = dup_hybrid.source.map(set)
dup_hybrid.head()
<ipython-input-28-89649d18870f>:1: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()` <ipython-input-28-89649d18870f>:2: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`
Out[28]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects_phrases | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | source_set | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::01b6397888c09d84f3dc89d807aa1004 | [4745, opendoar____::a9365bd906e11324065c35be4... | [4745, 2429, 4320] | [RU-Económicas, ru-económicas, ru económicas] | [roar, OpenDOAR, OpenDOAR] | [roar_4745, OpenDOAR_2429, OpenDOAR_4320] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, 2429, 4320] | [nan, {"name": "ru-econ\u00f3micas", "language... | [nan, [], []] | [nan, http://ru.iiec.unam.mx/, http://ru.iiec.... | [nan, this site provides access to the researc... | [nan, institutional, institutional] | [nan, ["es"], ["es"]] | [nan, 2021-09-13 13:35:56, 2021-09-13 13:36:17] | [nan, 2012-02-28 12:12:09, 2019-02-19 10:51:49] | [nan, [multidisciplinary], [business and econo... | [nan, [journal_articles, theses_and_dissertati... | [nan, [{'name': 'universidad nacional autónoma... | [nan, [{"policy_url": "http://ru.iiec.unam.mx/... | [nan, {"name": "eprints", "version": "3.3.15"}... | [nan, http://ru.iiec.unam.mx/cgi/oai2, nan] | [nan, yes, yes] | [nan, OpenDOAR_2429, OpenDOAR_4320] | [4745, nan, nan] | [31, nan, nan] | [archive, nan, nan] | [1447, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [disk0/00/00/47/45, nan, nan] | [2012-02-05 14:27:15, nan, nan] | [2012-04-16 10:34:36, nan, nan] | [2012-02-05 14:27:15, nan, nan] | [institutional, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [show, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [http://ru.iiec.unam.mx, nan, nan] | [RU-Económicas, nan, nan] | [http://ru.iiec.unam.mx/cgi/oai2, nan, nan] | [nan, nan, nan] | [http://ru.iiec.unam.mx/cgi/latest_tool?output... | [nan, nan, nan] | [Productos académicos del Instituto de Investi... | [TRUE, nan, nan] | [TRUE, nan, nan] | [TRUE, nan, nan] | [Instituto de Investigaciones Económicas UNAM,... | [http://www.iiec.unam.mx/, nan, nan] | [mx, nan, nan] | [Mexico, nan, nan] | [19.3162, nan, nan] | [-99.1799, nan, nan] | [eprints, nan, nan] | [geoname_2_MX, nan, nan] | [3.3.15 eps, nan, nan] | [[GF, HJ, HT, HB, HM, HC, HX, HN, H1, G1, T1, ... | [2012-02-03 05:18:16, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [0, nan, nan] | [0, nan, nan] | [0, nan, nan] | [94, nan, nan] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,7... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [[opendoar, celestial], nan, nan] | [[2429, 4818], nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [roar_4745, nan, nan] | {roar, OpenDOAR} |
1 | dedup::03db60c2331018b18c4166c1787072fe | [opendoar____::78bc62d08a9a0b9b0b9c0ad339ef82d... | [3087, 4500, 8504] | [landmark university repository, landmark univ... | [OpenDOAR, OpenDOAR, roar] | [OpenDOAR_3087, OpenDOAR_4500, roar_8504] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [3087, 4500, nan] | [{"name": "landmark university repository", "l... | [[], [], nan] | [http://eprints.lmu.edu.ng/, http://eprints.lm... | [this site provides access to the multi-discip... | [institutional, institutional, nan] | [["en"], ["en"], nan] | [2021-09-13 13:36:06, 2021-02-18 18:01:12, nan] | [2014-06-16 13:36:00, 2019-03-26 14:07:30, nan] | [[multidisciplinary], [multidisciplinary], nan] | [[journal_articles], [journal_articles, biblio... | [[{'name': 'landmark university', 'alternative... | [[{"policy_url": "http://eprints.lmu.edu.ng/po... | [{"name": "eprints", "version": "3.3.12"}, {"n... | [http://eprints.lmu.edu.ng/cgi/oai2, nan, nan] | [yes, yes, nan] | [OpenDOAR_3087, OpenDOAR_4500, nan] | [nan, nan, 8504] | [nan, nan, 12] | [nan, nan, archive] | [nan, nan, 5459] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, disk0/00/00/85/04] | [nan, nan, 2014-06-24 10:14:07] | [nan, nan, 2014-06-28 01:38:49] | [nan, nan, 2014-06-24 10:14:07] | [nan, nan, institutional] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, show] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, http://eprints.lmu.edu.ng] | [nan, nan, Landmark University Repository] | [nan, nan, http://eprints.lmu.edu.ng/cgi/oai] | [nan, nan, nan] | [nan, nan, http://eprints.lmu.edu.ng/cgi/lates... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, TRUE] | [nan, nan, TRUE] | [nan, nan, TRUE] | [nan, nan, Landmark University] | [nan, nan, http://lmu.edu.ng] | [nan, nan, ng] | [nan, nan, Omu-Aran] | [nan, nan, 8.12421] | [nan, nan, 5.09488] | [nan, nan, eprints] | [nan, nan, geoname_2_NG] | [nan, nan, 3.3.16 eps] | [nan, nan, nan] | [nan, nan, 2014-06-07 22:16:23] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, [opendoar, celestial]] | [nan, nan, [5621, 3087]] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, roar_8504] | {roar, OpenDOAR} |
2 | dedup::05128e44e27c36bdba71221bfccf735d | [opendoar____::426f990b332ef8193a61cc90516c124... | [2318, 5503, 4271] | [iława biblioteka cyrfrowa (iława digital libr... | [OpenDOAR, roar, roar] | [OpenDOAR_2318, roar_5503, roar_4271] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [2318, nan, nan] | [{"name": "i\u0142awa biblioteka cyrfrowa (i\u... | [[], nan, nan] | [http://ibc.ilawa.pl/dlibra, nan, nan] | [this site provides access to digitised articl... | [governmental, nan, nan] | [["pl"], nan, nan] | [2019-10-17 14:34:36, nan, nan] | [2011-10-11 13:13:58, nan, nan] | [[multidisciplinary], nan, nan] | [[journal_articles], nan, nan] | [[{'name': 'iława', 'alternativeName': '', 'co... | [[], nan, nan] | [{"name": "dlibra", "version": "4"}, nan, nan] | [http://ibc.ilawa.pl/dlibra/oai-pmh-repository... | [yes, nan, nan] | [OpenDOAR_2318, nan, nan] | [nan, 5503, 4271] | [nan, 9, 11] | [nan, archive, archive] | [nan, 8, 8] | [nan, nan, nan] | [nan, nan, nan] | [nan, disk0/00/00/55/03, disk0/00/00/42/71] | [nan, 2012-11-19 20:33:30, 2011-10-27 01:25:14] | [nan, 2012-11-26 06:53:42, 2011-12-19 07:07:23] | [nan, 2012-11-19 20:33:30, 2011-10-27 01:25:14] | [nan, other, other] | [nan, nan, nan] | [nan, nan, nan] | [nan, show, show] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, http://ibc.ilawa.pl/dlibra, http://ibc.i... | [nan, Iława Biblioteka Cyrfrowa (Iława Digital... | [nan, http://ibc.ilawa.pl/dlibra/oai-pmh-repos... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, This site provides access to digitised a... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, Iława, Iława] | [nan, http://www.ilawa.pl/_portal, http://www.... | [nan, pl, pl] | [nan, nan, nan] | [nan, 53.596, 53.596] | [nan, 19.5684, 19.5684] | [nan, nan, nan] | [nan, geoname_2_PL, geoname_2_PL] | [nan, other, other] | [nan, nan, nan] | [nan, 2012-07-01 15:13:09, 2009-10-12 10:46:08] | [nan, nan, nan] | [nan, nan, nan] | [nan, 0, 0] | [nan, 0, 0] | [nan, 0, 0] | [nan, 20, 20] | [nan, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, [opendoar, celestial], [opendoar, celest... | [nan, [2318, 4672], [2318, 4672]] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, roar_5503, roar_4271] | {roar, OpenDOAR} |
3 | dedup::069059b7ef840f0c74a814ec9237b6ec | [5711, 126, opendoar____::1cd3882394520876dc88... | [5711, 126, 1509] | [Bibioteca Digital Ação Educativa, Biblioteca ... | [roar, roar, OpenDOAR] | [roar_5711, roar_126, OpenDOAR_1509] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, 1509] | [nan, nan, {"name": "bibioteca digital a\u00e7... | [nan, nan, []] | [nan, nan, http://www.bdae.org.br/dspace/] | [nan, nan, this site provides access to the ou... | [nan, nan, institutional] | [nan, nan, ["pt"]] | [nan, nan, 2019-10-17 14:34:23] | [nan, nan, 2009-05-01 10:10:47] | [nan, nan, [education]] | [nan, nan, [theses_and_dissertations, unpub_re... | [nan, nan, [{'name': 'ação educativa', 'altern... | [nan, nan, []] | [nan, nan, {"name": "dspace", "version": ""}] | [nan, nan, http://www.bdae.org.br/dspace-oai/r... | [nan, nan, yes] | [nan, nan, OpenDOAR_1509] | [5711, 126, nan] | [9, 503, nan] | [archive, archive, nan] | [8, 1, nan] | [nan, nan, nan] | [nan, nan, nan] | [disk0/00/00/57/11, disk0/00/00/01/26, nan] | [2012-12-12 04:37:14, 2010-01-06 13:43:56, nan] | [2012-12-17 06:53:38, 2011-07-18 05:42:07, nan] | [2012-12-12 04:37:14, 2010-01-06 13:43:56, nan] | [institutional, other, nan] | [nan, nan, nan] | [nan, nan, nan] | [show, show, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, 0, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [http://www.bdae.org.br/dspace/, http://www.bd... | [Bibioteca Digital Ação Educativa, Biblioteca ... | [http://www.bdae.org.br/dspace-oai/request, ht... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [This site provides access to the output of th... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [Ação Educativa, Ação Educativa, nan] | [http://www.acaoeducativa.org/, http://www.aca... | [br, br, nan] | [nan, São Paulo, nan] | [-23.5445, -23.5445, nan] | [-46.6509, -46.6509, nan] | [dspace, dspace, nan] | [geoname_2_BR, geoname_2_BR, nan] | [other, other, nan] | [nan, nan, nan] | [2012-07-22 15:12:34, 2008-03-31 20:07:33, nan] | [nan, nan, nan] | [nan, nan, nan] | [0, 0, nan] | [0, 0, nan] | [0, 0, nan] | [100, 100, nan] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,97,100,... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [[opendoar, celestial], [opendoar, celestial],... | [[1430, 1509], [1430, 1509], nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [roar_5711, roar_126, nan] | {roar, OpenDOAR} |
4 | dedup::0e139b17a92b2df7d6c3c840e51465fe | [4379, 4266, opendoar____::f976b57bb9dd27aa2e7... | [4379, 4266, 2306] | [Institutional Repository of Ningbo Institute ... | [roar, roar, OpenDOAR] | [roar_4379, roar_4266, OpenDOAR_2306] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, 2306] | [nan, nan, {"name": "institutional repository ... | [nan, nan, [{'acronym': 'nimte openir'}]] | [nan, nan, http://ir.nimte.ac.cn/] | [nan, nan, this site provides access to the ou... | [nan, nan, institutional] | [nan, nan, ["zh", "en"]] | [nan, nan, 2019-10-17 14:34:36] | [nan, nan, 2011-10-10 13:13:11] | [nan, nan, [technology general, mechanical eng... | [nan, nan, [journal_articles, bibliographic_re... | [nan, nan, [{'name': 'chinese academy of scien... | [nan, nan, []] | [nan, nan, {"name": "dspace", "version": ""}] | [nan, nan, http://ir.nimte.ac.cn/casirgrid-oai... | [nan, nan, yes] | [nan, nan, OpenDOAR_2306] | [4379, 4266, nan] | [15, 11, nan] | [archive, archive, nan] | [986, 8, nan] | [nan, nan, nan] | [nan, nan, nan] | [disk0/00/00/43/79, disk0/00/00/42/66, nan] | [2011-11-09 23:16:22, 2011-10-27 01:26:05, nan] | [2011-12-21 15:25:04, 2011-12-19 07:07:21, nan] | [2011-11-09 23:16:22, 2011-10-27 01:26:05, nan] | [institutional, institutional, nan] | [nan, nan, nan] | [nan, nan, nan] | [show, show, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [http://ir.nimte.ac.cn/, http://ir.nimte.ac.cn... | [Institutional Repository of Ningbo Institute ... | [http://ir.nimte.ac.cn/casirgrid-oai/request, ... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [This site provides access to the output of th... | [TRUE, nan, nan] | [TRUE, nan, nan] | [FALSE, nan, nan] | [Ningbo Institute of Material Technology & Eng... | [http://www.nimte.ac.cn/, http://www.cas.cn/, ... | [cn, cn, nan] | [ningbo, nan, nan] | [29.8807, 29.8807, nan] | [121.672, 121.672, nan] | [dspace, dspace, nan] | [geoname_2_CN, geoname_2_CN, nan] | [other, other, nan] | [nan, nan, nan] | [2009-12-21 02:27:07, 2009-12-21 02:27:07, nan] | [nan, nan, nan] | [nan, nan, nan] | [0, 0, nan] | [0, 0, nan] | [0, 0, nan] | [100, 100, nan] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0... | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [celestial, [opendoar, celestial], nan] | [4668, [4668, 2306], nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [nan, nan, nan] | [roar_4379, roar_4266, nan] | {roar, OpenDOAR} |
In [29]:
dup_across = dup_across.groupby('dedup_id').aggregate(list).reset_index()
dup_across['source_set'] = dup_across.source.map(set)
dup_across.head()
<ipython-input-29-7abf9225ca42>:1: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()` <ipython-input-29-7abf9225ca42>:2: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider using pd.concat instead. To get a de-fragmented frame, use `newframe = frame.copy()`
Out[29]:
dedup_id | duplicate_id | original_id | name | source | unique_id | FAIRsharing_id | FAIRsharing_type | FAIRsharing_attributes.created-at | FAIRsharing_attributes.updated-at | FAIRsharing_attributes.metadata.doi | FAIRsharing_attributes.metadata.name | FAIRsharing_attributes.metadata.status | FAIRsharing_attributes.metadata.contacts | FAIRsharing_attributes.metadata.homepage | FAIRsharing_attributes.metadata.identifier | FAIRsharing_attributes.metadata.description | FAIRsharing_attributes.metadata.support-links | FAIRsharing_attributes.metadata.year-creation | FAIRsharing_attributes.metadata.data-processes | FAIRsharing_attributes.legacy-ids | FAIRsharing_attributes.fairsharing-registry | FAIRsharing_attributes.record-type | FAIRsharing_attributes.subjects | FAIRsharing_attributes.domains | FAIRsharing_attributes.taxonomies | FAIRsharing_attributes.user-defined-tags | FAIRsharing_attributes.countries | FAIRsharing_attributes.name | FAIRsharing_attributes.abbreviation | FAIRsharing_attributes.url | FAIRsharing_attributes.doi | FAIRsharing_attributes.fairsharing-licence | FAIRsharing_attributes.description | FAIRsharing_attributes.publications | FAIRsharing_attributes.licence-links | FAIRsharing_attributes.metadata.citations | FAIRsharing_attributes.metadata.abbreviation | FAIRsharing_attributes.metadata.access-points | FAIRsharing_attributes.metadata.associated-tools | FAIRsharing_attributes.metadata.deprecation-date | FAIRsharing_attributes.metadata.deprecation-reason | FAIRsharing_attributes.metadata.tombstone | FAIRsharing_unique_id | re3data_orgIdentifier | re3data_repositoryName | re3data_repositoryName.language | re3data_additionalName | re3data_repositoryURL | re3data_repositoryIdentifier | re3data_repositoryContact | re3data_description | re3data_description.language | re3data_type | re3data_size | re3data_startDate | re3data_endDate | re3data_repositoryLanguage | re3data_subject | re3data_missionStatementURL | re3data_contentType | re3data_providerType | re3data_keyword | re3data_institution | re3data_policy | re3data_databaseAccess | re3data_databaseLicense | re3data_dataAccess | re3data_dataLicense | re3data_dataUploadType | re3data_dataUploadLicense | re3data_software | re3data_versioning | re3data_api | re3data_pidSystem | re3data_citationGuidelineURL | re3data_aidSystem | re3data_enhancedPublication | re3data_qualityManagement | re3data_certificate | re3data_metadataStandard | re3data_syndication | re3data_remarks | re3data_entryDate | re3data_lastUpdate | re3data_unique_id | OpenDOAR_system_metadata.id | OpenDOAR_repository_metadata.name | OpenDOAR_repository_metadata.alternativename | OpenDOAR_repository_metadata.url | OpenDOAR_repository_metadata.description | OpenDOAR_repository_metadata.type | OpenDOAR_repository_metadata.content_languages | OpenDOAR_system_metadata.date_modified | OpenDOAR_system_metadata.date_created | OpenDOAR_repository_metadata.content_subjects_phrases | OpenDOAR_repository_metadata.content_types | OpenDOAR_organization | OpenDOAR_policy_urls | OpenDOAR_repository_metadata.software | OpenDOAR_repository_metadata.oai_url | OpenDOAR_system_metadata.publicly_visible | OpenDOAR_unique_id | roar_eprintid | roar_rev_number | roar_eprint_status | roar_userid | roar_importid | roar_source | roar_dir | roar_datestamp | roar_lastmod | roar_status_changed | roar_type | roar_succeeds | roar_commentary | roar_metadata_visibility | roar_latitude | roar_longitude | roar_relation_type | roar_relation_uri | roar_item_issues_id | roar_item_issues_type | roar_item_issues_description | roar_item_issues_timestamp | roar_item_issues_status | roar_item_issues_reported_by | roar_item_issues_resolved_by | roar_item_issues_comment | roar_item_issues_count | roar_sword_depositor | roar_sword_slug | roar_exemplar | roar_home_page | roar_title | roar_oai_pmh | roar_sword_endpoint | roar_rss_feed | roar_twitter_feed | roar_description | roar_fulltext | roar_open_access | roar_mandate | roar_organisation_title | roar_organisation_home_page | roar_location_country | roar_location_city | roar_location_latitude | roar_location_longitude | roar_software | roar_geoname | roar_version | roar_subjects | roar_date | roar_note | roar_suggestions | roar_activity_low | roar_activity_medium | roar_activity_high | roar_recordcount | roar_recordhistory | roar_fulltexts_total | roar_fulltexts_docs | roar_fulltexts_rtotal | roar_fulltexts_rdocs | roar_registry_name | roar_registry_id | roar_submit_to | roar_submitted_to_name | roar_submitted_to_done | roar_webometrics_rank | roar_webometrics_size | roar_webometrics_visibility | roar_webometrics_rich_files | roar_webometrics_scholar | roar_monthly_deposits | roar_total_deposits | roar_association | roar_unique_id | source_set | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | dedup::001e6d882e54c780ce269d3c46997287 | [re3data_____::4af9fe2bb93511a5e0f0c39e94d6557... | [r3d100011306, 2094] | [RESID Database of Protein Modifications, RESI... | [re3data, FAIRsharing] | [re3data_r3d100011306, FAIRsharing_2094] | [nan, 2094] | [nan, fairsharing-records] | [nan, 2014-11-04T15:23:40.000Z] | [nan, 2021-09-30T11:38:37.114Z] | [nan, 10.25504/FAIRsharing.qaszjp] | [nan, RESID Database of Protein Modifications] | [nan, ready] | [nan, [{'contact-name': 'John S Garavelli', 'c... | [nan, http://pir.georgetown.edu/resid/] | [nan, 2094.0] | [nan, The RESID Database of Protein Modificati... | [nan, [{'url': 'http://pir.georgetown.edu/resi... | [nan, nan] | [nan, [{'url': 'ftp://ftp.pir.georgetown.edu/p... | [nan, [biodbcore-000563, bsg-d000563]] | [nan, Database] | [nan, knowledgebase] | [nan, [Life Science]] | [nan, [Molecular structure, Small molecule, St... | [nan, [All]] | [nan, []] | [nan, [United Kingdom, European Union, Switzer... | [nan, FAIRsharing record for: RESID Database o... | [nan, RESID] | [nan, https://fairsharing.org/10.25504/FAIRsha... | [nan, 10.25504/FAIRsharing.qaszjp] | [nan, https://creativecommons.org/licenses/by-... | [nan, This FAIRsharing record describes: The R... | [nan, [{'id': 334, 'pubmed_id': 12520062, 'tit... | [nan, [{'licence-name': 'Open Data Commons (OD... | [nan, nan] | [nan, RESID] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, FAIRsharing_2094] | [r3d100011306, nan] | [RESID Database of Protein Modifications, nan] | [eng, nan] | [[], nan] | [https://pir.georgetown.edu/resid/resid.shtml,... | [[FAIRsharing_doi:10.25504/FAIRsharing.qaszjp,... | [["pirmail@georgetown.edu"], nan] | [The RESID Database of Protein Modifications i... | [eng, nan] | [[disciplinary], nan] | [{"size": "", "updatedp": ""}, nan] | [2014, nan] | [nan, nan] | [["eng"], nan] | [[2 Life Sciences, 201 Basic Biological and Me... | [nan, nan] | [[Images, Structured text], nan] | [[dataProvider], nan] | [[genomes, life sciences, proteins, proteomes,... | [[{'institutionName': 'Georgetown University, ... | [[{"policyName": "Terms of Use", "policyURL": ... | [ {"databaseAccessType": "open", "databaseAcce... | [[], nan] | [[{"dataAccessType": "open", "dataAccessRestri... | [[{"dataLicenseName": "Copyrights", "dataLicen... | [closed, nan] | [[], nan] | [["unknown"], nan] | [yes, nan] | [["ftp://ftp.pir.georgetown.edu/databases/", "... | [["none"], nan] | [nan, nan] | [[], nan] | [yes, nan] | [unknown, nan] | [[], nan] | [[], nan] | [{}, nan] | [RESID is covered by Thomson Reuters Data Cita... | [2014-12-05, nan] | [2019-01-17, nan] | [re3data_r3d100011306, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | {FAIRsharing, re3data} |
1 | dedup::0023a1e3447fdb31836536cc903f1310 | [opendoar____::c6f798b844366ccd65d99bc7f31e0e0... | [3410, 10013] | [erucu: electronic repository of the ukrainian... | [OpenDOAR, roar] | [OpenDOAR_3410, roar_10013] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [3410, nan] | [{"name": "erucu: electronic repository of the... | [[], nan] | [http://er.ucu.edu.ua/, nan] | [ukrainian catholic university’s institutional... | [institutional, nan] | [["uk", "en"], nan] | [2019-10-17 14:34:57, nan] | [2015-07-08 12:43:38, nan] | [[multidisciplinary], nan] | [[journal_articles, conference_and_workshop_pa... | [[{'name': 'ukrainian catholic university', 'a... | [[], nan] | [{"name": "dspace", "version": ""}, nan] | [nan, nan] | [yes, nan] | [OpenDOAR_3410, nan] | [nan, 10013] | [nan, 31] | [nan, archive] | [nan, 7104] | [nan, nan] | [nan, nan] | [nan, disk0/00/01/00/13] | [nan, 2015-08-08 14:53:04] | [nan, 2016-03-21 19:54:43] | [nan, 2015-08-08 14:53:04] | [nan, institutional] | [nan, nan] | [nan, nan] | [nan, show] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, http://er.ucu.edu.ua/] | [nan, ErUCU: Electronic repository of the Ukra... | [nan, http://er.ucu.edu.ua/oai/request] | [nan, http://er.ucu.edu.ua/sword/] | [nan, http://er.ucu.edu.ua/feed/rss_2.0/site] | [nan, nan] | [nan, Ukrainian Catholic University’s institut... | [nan, TRUE] | [nan, TRUE] | [nan, TRUE] | [nan, Ukrainian Catholic University] | [nan, http://ucu.edu.ua/eng/] | [nan, ua] | [nan, Lviv] | [nan, nan] | [nan, nan] | [nan, dspace] | [nan, geoname_2_UA] | [nan, other] | [nan, [H1, L1, AC, D204, B1, D1, DK, BF, BS, H... | [nan, 2015-07-07 12:38:37] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, [opendoar, celestial]] | [nan, [3410, 5883]] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, [russell_group, ivy_league]] | [nan, roar_10013] | {roar, OpenDOAR} |
2 | dedup::003ab6b40af9b488decea7c582d150a2 | [https://fairsharing.org/10.25504/FAIRsharing.... | [2315, r3d100011894] | [Synapse, Synapse] | [FAIRsharing, re3data] | [FAIRsharing_2315, re3data_r3d100011894] | [2315, nan] | [fairsharing-records, nan] | [2016-08-02T13:56:30.000Z, nan] | [2021-09-30T11:38:43.134Z, nan] | [10.25504/FAIRsharing.dnxzmk, nan] | [Synapse, nan] | [ready, nan] | [[{'contact-name': 'Meredith Slota', 'contact-... | [https://www.synapse.org/, nan] | [2315.0, nan] | [Synapse is a collaborative research platform ... | [[{'url': 'SynapseInfo@sagebase.org', 'name': ... | [2010.0, nan] | [[{'url': 'https://www.synapse.org/', 'name': ... | [[biodbcore-000791, bsg-d000791], nan] | [Database, nan] | [repository, nan] | [[Biomedical Science, Data Management, Data In... | [[Experimental measurement, Protocol, Data sto... | [[All], nan] | [[], nan] | [[United States], nan] | [FAIRsharing record for: Synapse, nan] | [Synapse, nan] | [https://fairsharing.org/10.25504/FAIRsharing.... | [10.25504/FAIRsharing.dnxzmk, nan] | [https://creativecommons.org/licenses/by-sa/4.... | [This FAIRsharing record describes: Synapse is... | [[{'id': 2450, 'pubmed_id': 24071850, 'title':... | [[{'licence-name': 'Creative Commons Attributi... | [nan, nan] | [Synapse, nan] | [[{'url': 'http://rest-docs.synapse.org/rest/'... | [[{'url': 'https://sage-bionetworks.github.io/... | [nan, nan] | [nan, nan] | [nan, nan] | [FAIRsharing_2315, nan] | [nan, r3d100011894] | [nan, Synapse] | [nan, eng] | [nan, []] | [nan, https://www.synapse.org] | [nan, [RRID:SCR_006307, RRID:nlx_151983]] | [nan, ["synapseinfo@sagebase.org"]] | [nan, Synapse is an open source software platf... | [nan, eng] | [nan, [other]] | [nan, {"size": "", "updatedp": ""}] | [nan, 2012-05-22] | [nan, nan] | [nan, ["eng"]] | [nan, [2 Life Sciences, 201 Basic Biological a... | [nan, https://sagebionetworks.org/tools_resour... | [nan, [Raw data, Scientific and statistical da... | [nan, [dataProvider, serviceProvider]] | [nan, [AMP-AD Knowledge Portal, DREAM Challeng... | [nan, [{'institutionName': 'Alfred P. Sloan Fo... | [nan, [{"policyName": "Synapse Commons Governa... | [nan, {"databaseAccessType": "open", "databas... | [nan, []] | [nan, [{"dataAccessType": "closed", "dataAcces... | [nan, [{"dataLicenseName": "other", "dataLicen... | [nan, restricted] | [nan, []] | [nan, ["unknown"]] | [nan, yes] | [nan, ["https://docs.synapse.org/rest/", "REST"]] | [nan, ["DOI"]] | [nan, nan] | [nan, []] | [nan, yes] | [nan, yes] | [nan, []] | [nan, []] | [nan, {}] | [nan, nan] | [nan, 2015-12-03] | [nan, 2021-05-17] | [nan, re3data_r3d100011894] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | {FAIRsharing, re3data} |
3 | dedup::0064f599ed0adb5870a5b3ffe438e485 | [16034, opendoar____::d1f157379ea7e51d4a8c07af... | [16034, 9647] | [Giresun University Institutional Repository, ... | [roar, OpenDOAR] | [roar_16034, OpenDOAR_9647] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, 9647] | [nan, {"name": "giresun university institution... | [nan, [{'acronym': 'dspace@giresun'}, {'name':... | [nan, http://acikerisim.giresun.edu.tr] | [nan, this site provides access to the researc... | [nan, institutional] | [nan, ["tr"]] | [nan, 2021-05-21 18:05:06] | [nan, 2020-06-02 09:14:18] | [nan, [multidisciplinary]] | [nan, [journal_articles]] | [nan, [{'name': 'giresun university', 'alterna... | [nan, []] | [nan, {"name": "dspace", "version": "6.2"}] | [nan, http://acikerisim.giresun.edu.tr/oai/req... | [nan, yes] | [nan, OpenDOAR_9647] | [16034, nan] | [7, nan] | [archive, nan] | [12932, nan] | [nan, nan] | [nan, nan] | [disk0/00/01/60/34, nan] | [2020-06-01 20:13:50, nan] | [2020-06-01 20:14:04, nan] | [2020-06-01 20:13:50, nan] | [institutional, nan] | [nan, nan] | [nan, nan] | [show, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [https://acikerisim.giresun.edu.tr, nan] | [Giresun University Institutional Repository, ... | [https://acikerisim.giresun.edu.tr/oai, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [DSpace@Giresun is a growing collection of Gir... | [TRUE, nan] | [TRUE, nan] | [TRUE, nan] | [Giresun University, nan] | [https://www.giresun.edu.tr/, nan] | [tr, nan] | [Giresun, nan] | [40.9147, nan] | [38.323, nan] | [dspace, nan] | [geoname_2_TR, nan] | [other, nan] | [nan, nan] | [2020-05-29 18:13:17, nan] | [DSpace@Giresun is a growing collection of Gir... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roarmap, nan] | [http://roarmap.eprints.org/1046/, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_16034, nan] | {roar, OpenDOAR} |
4 | dedup::00ac8ed3b4327bdd4ebbebcb2ba10a00 | [610, opendoar____::299fb2142d7de959380f91c01c... | [610, 1426] | [Hedatuz, hedatuz] | [roar, OpenDOAR] | [roar_610, OpenDOAR_1426] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, 1426] | [nan, {"name": "hedatuz", "language": "en"}] | [nan, []] | [nan, http://hedatuz.euskomedia.org/] | [nan, this site contains works published by eu... | [nan, disciplinary] | [nan, ["eu", "fr", "es", "en"]] | [nan, 2019-10-17 14:34:21] | [nan, 2009-02-02 13:13:26] | [nan, [multidisciplinary]] | [nan, [journal_articles, books_chapters_and_se... | [nan, [{'name': 'euskomedia', 'alternativeName... | [nan, []] | [nan, {"name": "eprints", "version": "3.0.5"}] | [nan, http://hedatuz.euskomedia.org/cgi/oai2] | [nan, yes] | [nan, OpenDOAR_1426] | [610, nan] | [514, nan] | [archive, nan] | [1, nan] | [nan, nan] | [nan, nan] | [disk0/00/00/06/10, nan] | [2010-01-06 13:44:32, nan] | [2011-07-18 05:48:34, nan] | [2010-01-06 13:44:32, nan] | [institutional, nan] | [nan, nan] | [nan, nan] | [show, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [0, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [http://hedatuz.euskomedia.org/, nan] | [Hedatuz, nan] | [http://hedatuz.euskomedia.org/cgi/oai2, nan] | [nan, nan] | [http://hedatuz.euskomedia.org/cgi/latest_tool... | [nan, nan] | [Hedatuz, created by the Euskomedia Fundazioa,... | [TRUE, nan] | [TRUE, nan] | [nan, nan] | [Euskomedia Fundazioa, nan] | [http://www.euskomedia.org, nan] | [org, nan] | [ (Unknown city), nan] | [nan, nan] | [nan, nan] | [eprints, nan] | [geoname_2_ORG, nan] | [eprints-3.0.5, nan] | [nan, nan] | [2008-10-03 15:36:07, nan] | [nan, nan] | [nan, nan] | [0, nan] | [0, nan] | [0, nan] | [100, nan] | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,90,90,91,... | [nan, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [[opendoar, celestial], nan] | [[1294, 1426], nan] | [nan, nan] | [nan, nan] | [nan, nan] | [570, nan] | [331, nan] | [519, nan] | [145, nan] | [806, nan] | [nan, nan] | [nan, nan] | [nan, nan] | [roar_610, nan] | {roar, OpenDOAR} |
In [30]:
def remove_nan(list_obj):
if isinstance(list_obj, list):
while np.nan in list_obj:
list_obj.remove(np.nan)
return list_obj
dup_within.applymap(remove_nan).to_csv('../data/processed/dup_within.csv')
dup_hybrid.applymap(remove_nan).to_csv('../data/processed/dup_hybrid.csv')
dup_across.applymap(remove_nan).to_csv('../data/processed/dup_across.csv')